1 // SPDX-License-Identifier: GPL-2.0
2 #include <trace/syscall.h>
3 #include <trace/events/syscalls.h>
4 #include <linux/kernel_stat.h>
5 #include <linux/syscalls.h>
6 #include <linux/slab.h>
7 #include <linux/kernel.h>
8 #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
9 #include <linux/ftrace.h>
10 #include <linux/perf_event.h>
11 #include <linux/xarray.h>
12 #include <asm/syscall.h>
13
14 #include "trace_output.h"
15 #include "trace.h"
16
/* Taken by the syscall event register/unregister paths in this file */
static DEFINE_MUTEX(syscall_trace_lock);

/*
 * Forward declarations of the enter/exit event register callbacks.
 * Their definitions are not in this part of the file.
 */
static int syscall_enter_register(struct trace_event_call *event, enum trace_reg type,
				  void *data);
static int syscall_exit_register(struct trace_event_call *event, enum trace_reg type,
				 void *data);
23
24 static struct list_head *
syscall_get_enter_fields(struct trace_event_call * call)25 syscall_get_enter_fields(struct trace_event_call *call)
26 {
27 struct syscall_metadata *entry = call->data;
28
29 return &entry->enter_fields;
30 }
31
/*
 * Start and stop markers of the array of syscall metadata pointers
 * that find_syscall_meta() walks.
 */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/*
 * Syscall number to metadata lookup tables used by syscall_nr_to_meta():
 * the xarray when CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled, otherwise
 * the flat array indexed by syscall number.
 */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
37
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Default comparison of a kernel symbol name against a syscall metadata
 * name. It is only built when the architecture does not define
 * ARCH_HAS_SYSCALL_MATCH_SYM_NAME.
 *
 * The first three characters of both names are skipped: archs that use
 * syscall wrappers may have symbol aliases prefixed with ".SyS" or
 * ".sys" instead of "sys", which would otherwise cause an unwanted
 * mismatch. Both strings must therefore be at least three characters
 * long.
 *
 * Returns true when the remainders of the two names are equal.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_tail = sym + 3;
	const char *name_tail = name + 3;

	return strcmp(sym_tail, name_tail) == 0;
}
#endif
50
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that let 32bit applications run on a 64bit
 * kernel do not map the syscalls of 32bit tasks the same way as
 * those of 64bit tasks (x86 being one of them).
 *
 * Rather than reporting the wrong syscalls in that case, they are
 * simply ignored.
 *
 * An arch that wants its compat syscalls ignored has to define
 * ARCH_TRACE_IGNORE_COMPAT_SYSCALLS and provide the function
 * arch_trace_is_compat_syscall() so that the tracing system knows
 * which syscalls to skip.
 *
 * Returns -1 for a compat syscall, the syscall number otherwise.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	if (likely(!arch_trace_is_compat_syscall(regs)))
		return syscall_get_nr(task, regs);

	return -1;
}
#else
/* Nothing to filter out: report whatever syscall_get_nr() returns */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
83 static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)84 find_syscall_meta(unsigned long syscall)
85 {
86 struct syscall_metadata **start;
87 struct syscall_metadata **stop;
88 char str[KSYM_SYMBOL_LEN];
89
90
91 start = __start_syscalls_metadata;
92 stop = __stop_syscalls_metadata;
93 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
94
95 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
96 return NULL;
97
98 for ( ; start < stop; start++) {
99 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
100 return *start;
101 }
102 return NULL;
103 }
104
syscall_nr_to_meta(int nr)105 static struct syscall_metadata *syscall_nr_to_meta(int nr)
106 {
107 if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
108 return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
109
110 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
111 return NULL;
112
113 return syscalls_metadata[nr];
114 }
115
get_syscall_name(int syscall)116 const char *get_syscall_name(int syscall)
117 {
118 struct syscall_metadata *entry;
119
120 entry = syscall_nr_to_meta(syscall);
121 if (!entry)
122 return NULL;
123
124 return entry->name;
125 }
126
127 /* Added to user strings or arrays when max limit is reached */
128 #define EXTRA "..."
129
get_dynamic_len_ptr(struct syscall_trace_enter * trace,struct syscall_metadata * entry,int * offset_p,int * len_p,unsigned char ** ptr_p)130 static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
131 struct syscall_metadata *entry,
132 int *offset_p, int *len_p, unsigned char **ptr_p)
133 {
134 unsigned char *ptr;
135 int offset = *offset_p;
136 int val;
137
138 /* This arg points to a user space string */
139 ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
140 val = *(int *)ptr;
141
142 /* The value is a dynamic string (len << 16 | offset) */
143 ptr = (void *)trace + (val & 0xffff);
144 *len_p = val >> 16;
145 offset += 4;
146
147 *ptr_p = ptr;
148 *offset_p = offset;
149 }
150
151 static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter * trace,struct syscall_metadata * entry,struct trace_seq * s,struct trace_event * event)152 sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
153 struct trace_seq *s, struct trace_event *event)
154 {
155 unsigned char *ptr;
156 int offset = 0;
157 int bits, len;
158 bool done = false;
159 static const struct trace_print_flags __flags[] =
160 {
161 { O_TMPFILE, "O_TMPFILE" },
162 { O_WRONLY, "O_WRONLY" },
163 { O_RDWR, "O_RDWR" },
164 { O_CREAT, "O_CREAT" },
165 { O_EXCL, "O_EXCL" },
166 { O_NOCTTY, "O_NOCTTY" },
167 { O_TRUNC, "O_TRUNC" },
168 { O_APPEND, "O_APPEND" },
169 { O_NONBLOCK, "O_NONBLOCK" },
170 { O_DSYNC, "O_DSYNC" },
171 { O_DIRECT, "O_DIRECT" },
172 { O_LARGEFILE, "O_LARGEFILE" },
173 { O_DIRECTORY, "O_DIRECTORY" },
174 { O_NOFOLLOW, "O_NOFOLLOW" },
175 { O_NOATIME, "O_NOATIME" },
176 { O_CLOEXEC, "O_CLOEXEC" },
177 };
178
179 trace_seq_printf(s, "%s(", entry->name);
180
181 for (int i = 0; !done && i < entry->nb_args; i++) {
182
183 if (trace_seq_has_overflowed(s))
184 goto end;
185
186 if (i)
187 trace_seq_puts(s, ", ");
188
189 switch (i) {
190 case 2:
191 bits = trace->args[2];
192
193 trace_seq_puts(s, "flags: ");
194
195 /* No need to show mode when not creating the file */
196 if (!(bits & (O_CREAT|O_TMPFILE)))
197 done = true;
198
199 if (!(bits & O_ACCMODE)) {
200 if (!bits) {
201 trace_seq_puts(s, "O_RDONLY");
202 continue;
203 }
204 trace_seq_puts(s, "O_RDONLY|");
205 }
206
207 trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
208 /*
209 * trace_print_flags_seq() adds a '\0' to the
210 * buffer, but this needs to append more to the seq.
211 */
212 if (!trace_seq_has_overflowed(s))
213 trace_seq_pop(s);
214
215 continue;
216 case 3:
217 trace_seq_printf(s, "%s: 0%03o", entry->args[i],
218 (unsigned int)trace->args[i]);
219 continue;
220 }
221
222 trace_seq_printf(s, "%s: %lu", entry->args[i],
223 trace->args[i]);
224
225 if (!(BIT(i) & entry->user_mask))
226 continue;
227
228 get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
229 trace_seq_printf(s, " \"%.*s\"", len, ptr);
230 }
231
232 trace_seq_putc(s, ')');
233 end:
234 trace_seq_putc(s, '\n');
235
236 return trace_handle_return(s);
237 }
238
239 static enum print_line_t
print_syscall_enter(struct trace_iterator * iter,int flags,struct trace_event * event)240 print_syscall_enter(struct trace_iterator *iter, int flags,
241 struct trace_event *event)
242 {
243 struct trace_array *tr = iter->tr;
244 struct trace_seq *s = &iter->seq;
245 struct trace_entry *ent = iter->ent;
246 struct syscall_trace_enter *trace;
247 struct syscall_metadata *entry;
248 int i, syscall, val, len;
249 unsigned char *ptr;
250 int offset = 0;
251
252 trace = (typeof(trace))ent;
253 syscall = trace->nr;
254 entry = syscall_nr_to_meta(syscall);
255
256 if (!entry)
257 goto end;
258
259 if (entry->enter_event->event.type != ent->type) {
260 WARN_ON_ONCE(1);
261 goto end;
262 }
263
264 switch (entry->syscall_nr) {
265 case __NR_openat:
266 if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
267 return sys_enter_openat_print(trace, entry, s, event);
268 break;
269 default:
270 break;
271 }
272
273 trace_seq_printf(s, "%s(", entry->name);
274
275 for (i = 0; i < entry->nb_args; i++) {
276 bool printable = false;
277 char *str;
278
279 if (trace_seq_has_overflowed(s))
280 goto end;
281
282 if (i)
283 trace_seq_puts(s, ", ");
284
285 /* parameter types */
286 if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
287 trace_seq_printf(s, "%s ", entry->types[i]);
288
289 /* parameter values */
290 if (trace->args[i] < 10)
291 trace_seq_printf(s, "%s: %lu", entry->args[i],
292 trace->args[i]);
293 else
294 trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
295 trace->args[i]);
296
297 if (!(BIT(i) & entry->user_mask))
298 continue;
299
300 get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
301
302 if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
303 trace_seq_printf(s, " \"%.*s\"", len, ptr);
304 continue;
305 }
306
307 val = trace->args[entry->user_arg_size];
308
309 str = ptr;
310 trace_seq_puts(s, " (");
311 for (int x = 0; x < len; x++, ptr++) {
312 if (isascii(*ptr) && isprint(*ptr))
313 printable = true;
314 if (x)
315 trace_seq_putc(s, ':');
316 trace_seq_printf(s, "%02x", *ptr);
317 }
318 if (len < val)
319 trace_seq_printf(s, ", %s", EXTRA);
320
321 trace_seq_putc(s, ')');
322
323 /* If nothing is printable, don't bother printing anything */
324 if (!printable)
325 continue;
326
327 trace_seq_puts(s, " \"");
328 for (int x = 0; x < len; x++) {
329 if (isascii(str[x]) && isprint(str[x]))
330 trace_seq_putc(s, str[x]);
331 else
332 trace_seq_putc(s, '.');
333 }
334 if (len < val)
335 trace_seq_printf(s, "\"%s", EXTRA);
336 else
337 trace_seq_putc(s, '"');
338 }
339
340 trace_seq_putc(s, ')');
341 end:
342 trace_seq_putc(s, '\n');
343
344 return trace_handle_return(s);
345 }
346
347 static enum print_line_t
print_syscall_exit(struct trace_iterator * iter,int flags,struct trace_event * event)348 print_syscall_exit(struct trace_iterator *iter, int flags,
349 struct trace_event *event)
350 {
351 struct trace_seq *s = &iter->seq;
352 struct trace_entry *ent = iter->ent;
353 struct syscall_trace_exit *trace;
354 int syscall;
355 struct syscall_metadata *entry;
356
357 trace = (typeof(trace))ent;
358 syscall = trace->nr;
359 entry = syscall_nr_to_meta(syscall);
360
361 if (!entry) {
362 trace_seq_putc(s, '\n');
363 goto out;
364 }
365
366 if (entry->exit_event->event.type != ent->type) {
367 WARN_ON_ONCE(1);
368 return TRACE_TYPE_UNHANDLED;
369 }
370
371 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
372 trace->ret);
373
374 out:
375 return trace_handle_return(s);
376 }
377
378 #define SYSCALL_FIELD(_type, _name) { \
379 .type = #_type, .name = #_name, \
380 .size = sizeof(_type), .align = __alignof__(_type), \
381 .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
382
383 /* When len=0, we just calculate the needed length */
384 #define LEN_OR_ZERO (len ? len - pos : 0)
385
386 static int __init
sys_enter_openat_print_fmt(struct syscall_metadata * entry,char * buf,int len)387 sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
388 {
389 int pos = 0;
390
391 pos += snprintf(buf + pos, LEN_OR_ZERO,
392 "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
393 pos += snprintf(buf + pos, LEN_OR_ZERO,
394 " ((unsigned long)(REC->dfd)),");
395 pos += snprintf(buf + pos, LEN_OR_ZERO,
396 " ((unsigned long)(REC->filename)),");
397 pos += snprintf(buf + pos, LEN_OR_ZERO,
398 " __get_str(__filename_val),");
399 pos += snprintf(buf + pos, LEN_OR_ZERO,
400 " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
401 pos += snprintf(buf + pos, LEN_OR_ZERO,
402 " REC->flags ? __print_flags(REC->flags, \"|\", ");
403 pos += snprintf(buf + pos, LEN_OR_ZERO,
404 "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
405 pos += snprintf(buf + pos, LEN_OR_ZERO,
406 "{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
407 pos += snprintf(buf + pos, LEN_OR_ZERO,
408 "{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
409 pos += snprintf(buf + pos, LEN_OR_ZERO,
410 "{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
411 pos += snprintf(buf + pos, LEN_OR_ZERO,
412 "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
413 pos += snprintf(buf + pos, LEN_OR_ZERO,
414 "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
415 pos += snprintf(buf + pos, LEN_OR_ZERO,
416 "{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
417 pos += snprintf(buf + pos, LEN_OR_ZERO,
418 "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
419 pos += snprintf(buf + pos, LEN_OR_ZERO,
420 "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
421 pos += snprintf(buf + pos, LEN_OR_ZERO,
422 "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
423 pos += snprintf(buf + pos, LEN_OR_ZERO,
424 "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
425 pos += snprintf(buf + pos, LEN_OR_ZERO,
426 "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
427 pos += snprintf(buf + pos, LEN_OR_ZERO,
428 "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
429 pos += snprintf(buf + pos, LEN_OR_ZERO,
430 "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
431 pos += snprintf(buf + pos, LEN_OR_ZERO,
432 "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);
433
434 pos += snprintf(buf + pos, LEN_OR_ZERO,
435 " ((unsigned long)(REC->mode))");
436 return pos;
437 }
438
439 static int __init
__set_enter_print_fmt(struct syscall_metadata * entry,char * buf,int len)440 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
441 {
442 bool is_string = entry->user_arg_is_str;
443 int i;
444 int pos = 0;
445
446 switch (entry->syscall_nr) {
447 case __NR_openat:
448 return sys_enter_openat_print_fmt(entry, buf, len);
449 default:
450 break;
451 }
452
453 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
454 for (i = 0; i < entry->nb_args; i++) {
455 if (i)
456 pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
457 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
458 entry->args[i], sizeof(unsigned long));
459
460 if (!(BIT(i) & entry->user_mask))
461 continue;
462
463 /* Add the format for the user space string or array */
464 if (entry->user_arg_size < 0 || is_string)
465 pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
466 else
467 pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
468 }
469 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
470
471 for (i = 0; i < entry->nb_args; i++) {
472 pos += snprintf(buf + pos, LEN_OR_ZERO,
473 ", ((unsigned long)(REC->%s))", entry->args[i]);
474 if (!(BIT(i) & entry->user_mask))
475 continue;
476 /* The user space data for arg has name __<arg>_val */
477 if (entry->user_arg_size < 0 || is_string) {
478 pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
479 entry->args[i]);
480 } else {
481 pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
482 entry->args[i]);
483 }
484 }
485
486 #undef LEN_OR_ZERO
487
488 /* return the length of print_fmt */
489 return pos;
490 }
491
set_syscall_print_fmt(struct trace_event_call * call)492 static int __init set_syscall_print_fmt(struct trace_event_call *call)
493 {
494 char *print_fmt;
495 int len;
496 struct syscall_metadata *entry = call->data;
497
498 if (entry->enter_event != call) {
499 call->print_fmt = "\"0x%lx\", REC->ret";
500 return 0;
501 }
502
503 /* First: called with 0 length to calculate the needed length */
504 len = __set_enter_print_fmt(entry, NULL, 0);
505
506 print_fmt = kmalloc(len + 1, GFP_KERNEL);
507 if (!print_fmt)
508 return -ENOMEM;
509
510 /* Second: actually write the @print_fmt */
511 __set_enter_print_fmt(entry, print_fmt, len + 1);
512 call->print_fmt = print_fmt;
513
514 return 0;
515 }
516
free_syscall_print_fmt(struct trace_event_call * call)517 static void __init free_syscall_print_fmt(struct trace_event_call *call)
518 {
519 struct syscall_metadata *entry = call->data;
520
521 if (entry->enter_event == call)
522 kfree(call->print_fmt);
523 }
524
syscall_enter_define_fields(struct trace_event_call * call)525 static int __init syscall_enter_define_fields(struct trace_event_call *call)
526 {
527 struct syscall_trace_enter trace;
528 struct syscall_metadata *meta = call->data;
529 unsigned long mask;
530 char *arg;
531 int offset = offsetof(typeof(trace), args);
532 int ret = 0;
533 int len;
534 int i;
535
536 for (i = 0; i < meta->nb_args; i++) {
537 ret = trace_define_field(call, meta->types[i],
538 meta->args[i], offset,
539 sizeof(unsigned long), 0,
540 FILTER_OTHER);
541 if (ret)
542 break;
543 offset += sizeof(unsigned long);
544 }
545
546 if (ret || !meta->user_mask)
547 return ret;
548
549 mask = meta->user_mask;
550
551 while (mask) {
552 int idx = ffs(mask) - 1;
553 mask &= ~BIT(idx);
554
555 /*
556 * User space data is faulted into a temporary buffer and then
557 * added as a dynamic string or array to the end of the event.
558 * The user space data name for the arg pointer is
559 * "__<arg>_val".
560 */
561 len = strlen(meta->args[idx]) + sizeof("___val");
562 arg = kmalloc(len, GFP_KERNEL);
563 if (WARN_ON_ONCE(!arg)) {
564 meta->user_mask = 0;
565 return -ENOMEM;
566 }
567
568 snprintf(arg, len, "__%s_val", meta->args[idx]);
569
570 ret = trace_define_field(call, "__data_loc char[]",
571 arg, offset, sizeof(int), 0,
572 FILTER_OTHER);
573 if (ret) {
574 kfree(arg);
575 break;
576 }
577 offset += 4;
578 }
579 return ret;
580 }
581
582 /*
583 * Create a per CPU temporary buffer to copy user space pointers into.
584 *
585 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
586 * (defined in kernel/trace/trace.h)
587
588 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
589 * nul terminating byte and possibly appended EXTRA (4 bytes).
590 *
591 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
592 * to copy memory from user space addresses into that will hold
593 * 3 args as only 3 args are allowed to be copied from system calls.
594 */
595 #define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
596 #define SYSCALL_FAULT_MAX_CNT 3
597 #define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
598
599 /* Use the tracing per CPU buffer infrastructure to copy from user space */
600 struct syscall_user_buffer {
601 struct trace_user_buf_info buf;
602 struct rcu_head rcu;
603 };
604
605 static struct syscall_user_buffer *syscall_buffer;
606
syscall_fault_buffer_enable(void)607 static int syscall_fault_buffer_enable(void)
608 {
609 struct syscall_user_buffer *sbuf;
610 int ret;
611
612 lockdep_assert_held(&syscall_trace_lock);
613
614 if (syscall_buffer) {
615 trace_user_fault_get(&syscall_buffer->buf);
616 return 0;
617 }
618
619 sbuf = kmalloc_obj(*sbuf);
620 if (!sbuf)
621 return -ENOMEM;
622
623 ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
624 if (ret < 0) {
625 kfree(sbuf);
626 return ret;
627 }
628
629 WRITE_ONCE(syscall_buffer, sbuf);
630
631 return 0;
632 }
633
rcu_free_syscall_buffer(struct rcu_head * rcu)634 static void rcu_free_syscall_buffer(struct rcu_head *rcu)
635 {
636 struct syscall_user_buffer *sbuf =
637 container_of(rcu, struct syscall_user_buffer, rcu);
638
639 trace_user_fault_destroy(&sbuf->buf);
640 kfree(sbuf);
641 }
642
643
syscall_fault_buffer_disable(void)644 static void syscall_fault_buffer_disable(void)
645 {
646 struct syscall_user_buffer *sbuf = syscall_buffer;
647
648 lockdep_assert_held(&syscall_trace_lock);
649
650 if (trace_user_fault_put(&sbuf->buf))
651 return;
652
653 WRITE_ONCE(syscall_buffer, NULL);
654 call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
655 }
656
657 struct syscall_args {
658 char *ptr_array[SYSCALL_FAULT_MAX_CNT];
659 int read[SYSCALL_FAULT_MAX_CNT];
660 int uargs;
661 };
662
/*
 * Copy callback for string arguments.
 *
 * @data is a struct syscall_args. Each of its user pointers is copied
 * with strncpy_from_user(), at most @size bytes, into its own
 * SYSCALL_FAULT_ARG_SZ sized slot of @buf. The result of each copy is
 * stored in read[]. The incoming @ptr is not used. Always returns 0.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;

	for (int i = 0; i < args->uargs; i++) {
		char *slot = buf + i * SYSCALL_FAULT_ARG_SZ;

		args->read[i] = strncpy_from_user(slot,
						  (char __user *)args->ptr_array[i],
						  size);
	}
	return 0;
}
676
/*
 * Copy callback for array arguments.
 *
 * @data is a struct syscall_args. Exactly @size bytes are copied with
 * __copy_from_user() from each of its user pointers into its own
 * SYSCALL_FAULT_ARG_SZ sized slot of @buf. read[] is set to @size for
 * a complete copy and to -1 otherwise. The incoming @ptr is not used.
 * Always returns 0.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int left;

	for (int i = 0; i < args->uargs; i++) {
		char *slot = buf + i * SYSCALL_FAULT_ARG_SZ;

		left = __copy_from_user(slot, (char __user *)args->ptr_array[i],
					size);
		if (left)
			args->read[i] = -1;
		else
			args->read[i] = size;
	}
	return 0;
}
690
sys_fault_user(unsigned int buf_size,struct syscall_metadata * sys_data,struct syscall_user_buffer * sbuf,unsigned long * args,unsigned int data_size[SYSCALL_FAULT_MAX_CNT])691 static char *sys_fault_user(unsigned int buf_size,
692 struct syscall_metadata *sys_data,
693 struct syscall_user_buffer *sbuf,
694 unsigned long *args,
695 unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
696 {
697 trace_user_buf_copy syscall_copy = syscall_copy_user;
698 unsigned long mask = sys_data->user_mask;
699 unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
700 struct syscall_args sargs;
701 bool array = false;
702 char *buffer;
703 char *buf;
704 int ret;
705 int i = 0;
706
707 /* The extra is appended to the user data in the buffer */
708 BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
709 SYSCALL_FAULT_ARG_SZ);
710
711 /*
712 * If this system call event has a size argument, use
713 * it to define how much of user space memory to read,
714 * and read it as an array and not a string.
715 */
716 if (sys_data->user_arg_size >= 0) {
717 array = true;
718 size = args[sys_data->user_arg_size];
719 if (size > SYSCALL_FAULT_ARG_SZ - 1)
720 size = SYSCALL_FAULT_ARG_SZ - 1;
721 syscall_copy = syscall_copy_user_array;
722 }
723
724 while (mask) {
725 int idx = ffs(mask) - 1;
726 mask &= ~BIT(idx);
727
728 if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
729 break;
730
731 /* Get the pointer to user space memory to read */
732 sargs.ptr_array[i++] = (char *)args[idx];
733 }
734
735 sargs.uargs = i;
736
737 /* Clear the values that are not used */
738 for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
739 data_size[i] = -1; /* Denotes no pointer */
740 }
741
742 /* A zero size means do not even try */
743 if (!buf_size)
744 return NULL;
745
746 buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
747 syscall_copy, &sargs);
748 if (!buffer)
749 return NULL;
750
751 buf = buffer;
752 for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
753
754 ret = sargs.read[i];
755 if (ret < 0)
756 continue;
757 buf[ret] = '\0';
758
759 /* For strings, replace any non-printable characters with '.' */
760 if (!array) {
761 for (int x = 0; x < ret; x++) {
762 if (!isprint(buf[x]))
763 buf[x] = '.';
764 }
765
766 size = min(buf_size, SYSCALL_FAULT_USER_MAX);
767
768 /*
769 * If the text was truncated due to our max limit,
770 * add "..." to the string.
771 */
772 if (ret > size) {
773 strscpy(buf + size, EXTRA, sizeof(EXTRA));
774 ret = size + sizeof(EXTRA);
775 } else {
776 buf[ret++] = '\0';
777 }
778 } else {
779 ret = min((unsigned int)ret, buf_size);
780 }
781 data_size[i] = ret;
782 }
783
784 return buffer;
785 }
786
787 static int
syscall_get_data(struct syscall_metadata * sys_data,unsigned long * args,char ** buffer,int * size,int * user_sizes,int * uargs,int buf_size)788 syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
789 char **buffer, int *size, int *user_sizes, int *uargs,
790 int buf_size)
791 {
792 struct syscall_user_buffer *sbuf;
793 int i;
794
795 /* If the syscall_buffer is NULL, tracing is being shutdown */
796 sbuf = READ_ONCE(syscall_buffer);
797 if (!sbuf)
798 return -1;
799
800 *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
801 /*
802 * user_size is the amount of data to append.
803 * Need to add 4 for the meta field that points to
804 * the user memory at the end of the event and also
805 * stores its size.
806 */
807 for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
808 if (user_sizes[i] < 0)
809 break;
810 *size += user_sizes[i] + 4;
811 }
812 /* Save the number of user read arguments of this syscall */
813 *uargs = i;
814 return 0;
815 }
816
syscall_put_data(struct syscall_metadata * sys_data,struct syscall_trace_enter * entry,char * buffer,int size,int * user_sizes,int uargs)817 static void syscall_put_data(struct syscall_metadata *sys_data,
818 struct syscall_trace_enter *entry,
819 char *buffer, int size, int *user_sizes, int uargs)
820 {
821 char *buf = buffer;
822 void *ptr;
823 int val;
824
825 /*
826 * Set the pointer to point to the meta data of the event
827 * that has information about the stored user space memory.
828 */
829 ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
830
831 /*
832 * The meta data will store the offset of the user data from
833 * the beginning of the event. That is after the static arguments
834 * and the meta data fields.
835 */
836 val = (ptr - (void *)entry) + 4 * uargs;
837
838 for (int i = 0; i < uargs; i++) {
839
840 if (i)
841 val += user_sizes[i - 1];
842
843 /* Store the offset and the size into the meta data */
844 *(int *)ptr = val | (user_sizes[i] << 16);
845
846 /* Skip the meta data */
847 ptr += 4;
848 }
849
850 for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
851 /* Nothing to do if the user space was empty or faulted */
852 if (!user_sizes[i])
853 continue;
854
855 memcpy(ptr, buf, user_sizes[i]);
856 ptr += user_sizes[i];
857 }
858 }
859
ftrace_syscall_enter(void * data,struct pt_regs * regs,long id)860 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
861 {
862 struct trace_array *tr = data;
863 struct trace_event_file *trace_file;
864 struct syscall_trace_enter *entry;
865 struct syscall_metadata *sys_data;
866 struct trace_event_buffer fbuffer;
867 unsigned long args[6];
868 char *user_ptr;
869 int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
870 int syscall_nr;
871 int size = 0;
872 int uargs = 0;
873 bool mayfault;
874
875 /*
876 * Syscall probe called with preemption enabled, but the ring
877 * buffer and per-cpu data require preemption to be disabled.
878 */
879 might_fault();
880
881 syscall_nr = trace_get_syscall_nr(current, regs);
882 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
883 return;
884
885 trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
886 if (!trace_file)
887 return;
888
889 if (trace_trigger_soft_disabled(trace_file))
890 return;
891
892 sys_data = syscall_nr_to_meta(syscall_nr);
893 if (!sys_data)
894 return;
895
896 /* Check if this syscall event faults in user space memory */
897 mayfault = sys_data->user_mask != 0;
898
899 guard(preempt_notrace)();
900
901 syscall_get_arguments(current, regs, args);
902
903 if (mayfault) {
904 if (syscall_get_data(sys_data, args, &user_ptr,
905 &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
906 return;
907 }
908
909 size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
910
911 entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
912 if (!entry)
913 return;
914
915 entry = ring_buffer_event_data(fbuffer.event);
916 entry->nr = syscall_nr;
917
918 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
919
920 if (mayfault)
921 syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
922
923 trace_event_buffer_commit(&fbuffer);
924 }
925
ftrace_syscall_exit(void * data,struct pt_regs * regs,long ret)926 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
927 {
928 struct trace_array *tr = data;
929 struct trace_event_file *trace_file;
930 struct syscall_trace_exit *entry;
931 struct syscall_metadata *sys_data;
932 struct trace_event_buffer fbuffer;
933 int syscall_nr;
934
935 /*
936 * Syscall probe called with preemption enabled, but the ring
937 * buffer and per-cpu data require preemption to be disabled.
938 */
939 might_fault();
940 guard(preempt_notrace)();
941
942 syscall_nr = trace_get_syscall_nr(current, regs);
943 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
944 return;
945
946 trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
947 if (!trace_file)
948 return;
949
950 if (trace_trigger_soft_disabled(trace_file))
951 return;
952
953 sys_data = syscall_nr_to_meta(syscall_nr);
954 if (!sys_data)
955 return;
956
957 entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
958 if (!entry)
959 return;
960
961 entry = ring_buffer_event_data(fbuffer.event);
962 entry->nr = syscall_nr;
963 entry->ret = syscall_get_return_value(current, regs);
964
965 trace_event_buffer_commit(&fbuffer);
966 }
967
reg_event_syscall_enter(struct trace_event_file * file,struct trace_event_call * call)968 static int reg_event_syscall_enter(struct trace_event_file *file,
969 struct trace_event_call *call)
970 {
971 struct syscall_metadata *sys_data = call->data;
972 struct trace_array *tr = file->tr;
973 int ret = 0;
974 int num;
975
976 num = sys_data->syscall_nr;
977 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
978 return -ENOSYS;
979 guard(mutex)(&syscall_trace_lock);
980 if (sys_data->user_mask) {
981 ret = syscall_fault_buffer_enable();
982 if (ret < 0)
983 return ret;
984 }
985 if (!tr->sys_refcount_enter) {
986 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
987 if (ret < 0) {
988 if (sys_data->user_mask)
989 syscall_fault_buffer_disable();
990 return ret;
991 }
992 }
993 WRITE_ONCE(tr->enter_syscall_files[num], file);
994 tr->sys_refcount_enter++;
995 return 0;
996 }
997
unreg_event_syscall_enter(struct trace_event_file * file,struct trace_event_call * call)998 static void unreg_event_syscall_enter(struct trace_event_file *file,
999 struct trace_event_call *call)
1000 {
1001 struct syscall_metadata *sys_data = call->data;
1002 struct trace_array *tr = file->tr;
1003 int num;
1004
1005 num = sys_data->syscall_nr;
1006 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1007 return;
1008 guard(mutex)(&syscall_trace_lock);
1009 tr->sys_refcount_enter--;
1010 WRITE_ONCE(tr->enter_syscall_files[num], NULL);
1011 if (!tr->sys_refcount_enter)
1012 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
1013 if (sys_data->user_mask)
1014 syscall_fault_buffer_disable();
1015 }
1016
reg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1017 static int reg_event_syscall_exit(struct trace_event_file *file,
1018 struct trace_event_call *call)
1019 {
1020 struct trace_array *tr = file->tr;
1021 int ret = 0;
1022 int num;
1023
1024 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1025 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1026 return -ENOSYS;
1027 mutex_lock(&syscall_trace_lock);
1028 if (!tr->sys_refcount_exit)
1029 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
1030 if (!ret) {
1031 WRITE_ONCE(tr->exit_syscall_files[num], file);
1032 tr->sys_refcount_exit++;
1033 }
1034 mutex_unlock(&syscall_trace_lock);
1035 return ret;
1036 }
1037
unreg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1038 static void unreg_event_syscall_exit(struct trace_event_file *file,
1039 struct trace_event_call *call)
1040 {
1041 struct trace_array *tr = file->tr;
1042 int num;
1043
1044 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1045 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1046 return;
1047 mutex_lock(&syscall_trace_lock);
1048 tr->sys_refcount_exit--;
1049 WRITE_ONCE(tr->exit_syscall_files[num], NULL);
1050 if (!tr->sys_refcount_exit)
1051 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
1052 mutex_unlock(&syscall_trace_lock);
1053 }
1054
/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
 *
 * @call: the trace event being initialized. Only the entry event of a
 *        system call is processed; for any other event the meta data is
 *        left untouched.
 * @nr:   the system call number, used to select the argument layout.
 *
 * Bit N set in user_mask selects argument N. user_arg_size is set to the
 * index of the argument that holds the size of that memory, or stays at
 * -1 when the matching case does not assign one. user_arg_is_str is set
 * by the sized cases that are commented "as string".
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
	struct syscall_metadata *sys_data = call->data;
	unsigned long mask;

	/* Only work on entry */
	if (sys_data->enter_event != call)
		return;

	/* Start out with "no size argument"; some cases below override it */
	sys_data->user_arg_size = -1;

	switch (nr) {
	/* user arg 1 with size arg at 2 */
	case __NR_write:
#ifdef __NR_mq_timedsend
	case __NR_mq_timedsend:
#endif
	case __NR_pwrite64:
		sys_data->user_mask = BIT(1);
		sys_data->user_arg_size = 2;
		break;
	/* user arg 0 with size arg at 1 as string */
	case __NR_setdomainname:
	case __NR_sethostname:
		sys_data->user_mask = BIT(0);
		sys_data->user_arg_size = 1;
		sys_data->user_arg_is_str = 1;
		break;
#ifdef __NR_kexec_file_load
	/* user arg 4 with size arg at 3 as string */
	case __NR_kexec_file_load:
		sys_data->user_mask = BIT(4);
		sys_data->user_arg_size = 3;
		sys_data->user_arg_is_str = 1;
		break;
#endif
	/* user arg at position 0 */
#ifdef __NR_access
	case __NR_access:
#endif
	case __NR_acct:
	case __NR_chdir:
#ifdef __NR_chown
	case __NR_chown:
#endif
#ifdef __NR_chmod
	case __NR_chmod:
#endif
	case __NR_chroot:
#ifdef __NR_creat
	case __NR_creat:
#endif
	case __NR_delete_module:
	case __NR_execve:
	case __NR_fsopen:
#ifdef __NR_lchown
	case __NR_lchown:
#endif
#ifdef __NR_open
	case __NR_open:
#endif
	case __NR_memfd_create:
#ifdef __NR_mkdir
	case __NR_mkdir:
#endif
#ifdef __NR_mknod
	case __NR_mknod:
#endif
	case __NR_mq_open:
	case __NR_mq_unlink:
#ifdef __NR_readlink
	case __NR_readlink:
#endif
#ifdef __NR_rmdir
	case __NR_rmdir:
#endif
	case __NR_shmdt:
#ifdef __NR_statfs
	case __NR_statfs:
#endif
	case __NR_swapon:
	case __NR_swapoff:
#ifdef __NR_truncate
	case __NR_truncate:
#endif
#ifdef __NR_unlink
	case __NR_unlink:
#endif
	case __NR_umount2:
#ifdef __NR_utime
	case __NR_utime:
#endif
#ifdef __NR_utimes
	case __NR_utimes:
#endif
		sys_data->user_mask = BIT(0);
		break;
	/* user arg at position 1 */
	case __NR_execveat:
	case __NR_faccessat:
	case __NR_faccessat2:
	case __NR_finit_module:
	case __NR_fchmodat:
	case __NR_fchmodat2:
	case __NR_fchownat:
	case __NR_fgetxattr:
	case __NR_flistxattr:
	case __NR_fsetxattr:
	case __NR_fspick:
	case __NR_fremovexattr:
#ifdef __NR_futimesat
	case __NR_futimesat:
#endif
	case __NR_inotify_add_watch:
	case __NR_mkdirat:
	case __NR_mknodat:
	case __NR_mount_setattr:
	case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
	case __NR_newfstatat:
#endif
	case __NR_openat:
	case __NR_openat2:
	case __NR_open_tree:
	case __NR_open_tree_attr:
	case __NR_readlinkat:
	case __NR_quotactl:
	case __NR_syslog:
	case __NR_statx:
	case __NR_unlinkat:
#ifdef __NR_utimensat
	case __NR_utimensat:
#endif
		sys_data->user_mask = BIT(1);
		break;
	/* user arg at position 2 */
	case __NR_init_module:
	case __NR_fsconfig:
		sys_data->user_mask = BIT(2);
		break;
	/* user arg at position 4 */
	case __NR_fanotify_mark:
		sys_data->user_mask = BIT(4);
		break;
	/* 2 user args, 0 and 1 */
	case __NR_add_key:
	case __NR_getxattr:
	case __NR_lgetxattr:
	case __NR_lremovexattr:
#ifdef __NR_link
	case __NR_link:
#endif
	case __NR_listxattr:
	case __NR_llistxattr:
	case __NR_lsetxattr:
	case __NR_pivot_root:
	case __NR_removexattr:
#ifdef __NR_rename
	case __NR_rename:
#endif
	case __NR_request_key:
	case __NR_setxattr:
#ifdef __NR_symlink
	case __NR_symlink:
#endif
		sys_data->user_mask = BIT(0) | BIT(1);
		break;
	/* 2 user args, 0 and 2 */
	case __NR_symlinkat:
		sys_data->user_mask = BIT(0) | BIT(2);
		break;
	/* 2 user args, 1 and 3 */
	case __NR_getxattrat:
	case __NR_linkat:
	case __NR_listxattrat:
	case __NR_move_mount:
#ifdef __NR_renameat
	case __NR_renameat:
#endif
	case __NR_renameat2:
	case __NR_removexattrat:
	case __NR_setxattrat:
		sys_data->user_mask = BIT(1) | BIT(3);
		break;
	/*
	 * NOTE(review): the mask below selects three arguments (0, 1 and 2)
	 * while the inline comment names only two and carries a TODO for a
	 * third - confirm whether that comment is stale.
	 */
	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
		break;
	default:
		/* No user space argument is recorded for any other system call */
		sys_data->user_mask = 0;
		return;
	}

	/* No size argument was assigned, so there is nothing left to validate */
	if (sys_data->user_arg_size < 0)
		return;

	/*
	 * The user_arg_size can only be used when the system call
	 * is reading only a single address from user space.
	 *
	 * mask & (mask - 1) is non-zero exactly when more than one bit is
	 * set; in that case warn and fall back to "no size argument".
	 */
	mask = sys_data->user_mask;
	if (WARN_ON(mask & (mask - 1)))
		sys_data->user_arg_size = -1;
}
1263
init_syscall_trace(struct trace_event_call * call)1264 static int __init init_syscall_trace(struct trace_event_call *call)
1265 {
1266 int id;
1267 int num;
1268
1269 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1270 if (num < 0 || num >= NR_syscalls) {
1271 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
1272 ((struct syscall_metadata *)call->data)->name);
1273 return -ENOSYS;
1274 }
1275
1276 check_faultable_syscall(call, num);
1277
1278 if (set_syscall_print_fmt(call) < 0)
1279 return -ENOMEM;
1280
1281 id = trace_event_raw_init(call);
1282
1283 if (id < 0) {
1284 free_syscall_print_fmt(call);
1285 return id;
1286 }
1287
1288 return id;
1289 }
1290
/*
 * Field table used by the syscall entry event class: the fixed
 * __syscall_nr field, followed by an entry of type TRACE_FUNCTION_TYPE
 * that hands the remaining field definitions to
 * syscall_enter_define_fields(). The trailing empty entry is presumably
 * the end-of-table sentinel (the consumer is not visible here).
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};
1297
/* Output callbacks for syscall entry events: only .trace is provided. */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};
1301
/* Output callbacks for syscall exit events: only .trace is provided. */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};
1305
1306 struct trace_event_class __refdata event_class_syscall_enter = {
1307 .system = "syscalls",
1308 .reg = syscall_enter_register,
1309 .fields_array = syscall_enter_fields_array,
1310 .get_fields = syscall_get_enter_fields,
1311 .raw_init = init_syscall_trace,
1312 };
1313
1314 struct trace_event_class __refdata event_class_syscall_exit = {
1315 .system = "syscalls",
1316 .reg = syscall_exit_register,
1317 .fields_array = (struct trace_event_fields[]){
1318 SYSCALL_FIELD(int, __syscall_nr),
1319 SYSCALL_FIELD(long, ret),
1320 {}
1321 },
1322 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
1323 .raw_init = init_syscall_trace,
1324 };
1325
arch_syscall_addr(int nr)1326 unsigned long __init __weak arch_syscall_addr(int nr)
1327 {
1328 return (unsigned long)sys_call_table[nr];
1329 }
1330
init_ftrace_syscalls(void)1331 void __init init_ftrace_syscalls(void)
1332 {
1333 struct syscall_metadata *meta;
1334 unsigned long addr;
1335 int i;
1336 void *ret;
1337
1338 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1339 syscalls_metadata = kzalloc_objs(*syscalls_metadata,
1340 NR_syscalls);
1341 if (!syscalls_metadata) {
1342 WARN_ON(1);
1343 return;
1344 }
1345 }
1346
1347 for (i = 0; i < NR_syscalls; i++) {
1348 addr = arch_syscall_addr(i);
1349 meta = find_syscall_meta(addr);
1350 if (!meta)
1351 continue;
1352
1353 meta->syscall_nr = i;
1354
1355 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1356 syscalls_metadata[i] = meta;
1357 } else {
1358 ret = xa_store(&syscalls_metadata_sparse, i, meta,
1359 GFP_KERNEL);
1360 WARN(xa_is_err(ret),
1361 "Syscall memory allocation failed\n");
1362 }
1363
1364 }
1365 }
1366
1367 #ifdef CONFIG_PERF_EVENTS
1368
/*
 * One bit per syscall number: set while perf has the entry (resp. exit)
 * event of that syscall enabled, and tested by the perf probes before
 * they record anything.
 */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/*
 * Number of perf-enabled entry (resp. exit) events. The matching
 * tracepoint probe is registered when the count leaves zero and
 * unregistered when it returns to zero; both counters are only changed
 * with syscall_trace_lock held.
 */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
1373
perf_call_bpf_enter(struct trace_event_call * call,struct pt_regs * regs,struct syscall_metadata * sys_data,struct syscall_trace_enter * rec)1374 static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
1375 struct syscall_metadata *sys_data,
1376 struct syscall_trace_enter *rec)
1377 {
1378 struct syscall_tp_t {
1379 struct trace_entry ent;
1380 int syscall_nr;
1381 unsigned long args[SYSCALL_DEFINE_MAXARGS];
1382 } __aligned(8) param;
1383 int i;
1384
1385 BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
1386
1387 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
1388 perf_fetch_caller_regs(regs);
1389 *(struct pt_regs **)¶m = regs;
1390 param.syscall_nr = rec->nr;
1391 for (i = 0; i < sys_data->nb_args; i++)
1392 param.args[i] = rec->args[i];
1393 return trace_call_bpf(call, ¶m);
1394 }
1395
/*
 * sys_enter tracepoint probe used by perf.
 *
 * @ignore: probe data pointer, not used
 * @regs:   registers of the task entering the system call
 * @id:     not used; the syscall number is read from @regs instead
 *
 * Records the syscall number and its arguments (plus any user space data
 * selected by the metadata's user_mask) into a perf trace buffer, runs
 * attached BPF programs and submits the record to the perf events of the
 * current CPU. Returns silently whenever the syscall is out of range,
 * not enabled for perf, has no metadata, has no consumer, or a buffer
 * cannot be obtained.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	unsigned long args[6];
	bool valid_prog_array;
	bool mayfault;
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
	int syscall_nr;
	int rctx;
	int size = 0;
	int uargs = 0;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	/* Ignore syscalls outside the table or not enabled for perf */
	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	/*
	 * The user space data is collected before the perf buffer is
	 * allocated; 'size' comes back holding the extra space it needs.
	 * Nothing is recorded when the collection fails.
	 */
	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	/* Nothing to do without BPF programs or perf events on this CPU */
	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/*
	 * Give the recursion context back without submitting when the BPF
	 * call returned 0, or when no perf event is attached on this CPU.
	 */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}
1472
perf_sysenter_enable(struct trace_event_call * call)1473 static int perf_sysenter_enable(struct trace_event_call *call)
1474 {
1475 struct syscall_metadata *sys_data = call->data;
1476 int num;
1477 int ret;
1478
1479 num = sys_data->syscall_nr;
1480
1481 guard(mutex)(&syscall_trace_lock);
1482 if (sys_data->user_mask) {
1483 ret = syscall_fault_buffer_enable();
1484 if (ret < 0)
1485 return ret;
1486 }
1487 if (!sys_perf_refcount_enter) {
1488 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
1489 if (ret) {
1490 pr_info("event trace: Could not activate syscall entry trace point");
1491 if (sys_data->user_mask)
1492 syscall_fault_buffer_disable();
1493 return ret;
1494 }
1495 }
1496 set_bit(num, enabled_perf_enter_syscalls);
1497 sys_perf_refcount_enter++;
1498 return 0;
1499 }
1500
perf_sysenter_disable(struct trace_event_call * call)1501 static void perf_sysenter_disable(struct trace_event_call *call)
1502 {
1503 struct syscall_metadata *sys_data = call->data;
1504 int num;
1505
1506 num = sys_data->syscall_nr;
1507
1508 guard(mutex)(&syscall_trace_lock);
1509 sys_perf_refcount_enter--;
1510 clear_bit(num, enabled_perf_enter_syscalls);
1511 if (!sys_perf_refcount_enter)
1512 unregister_trace_sys_enter(perf_syscall_enter, NULL);
1513 if (sys_data->user_mask)
1514 syscall_fault_buffer_disable();
1515 }
1516
perf_call_bpf_exit(struct trace_event_call * call,struct pt_regs * regs,struct syscall_trace_exit * rec)1517 static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
1518 struct syscall_trace_exit *rec)
1519 {
1520 struct syscall_tp_t {
1521 struct trace_entry ent;
1522 int syscall_nr;
1523 unsigned long ret;
1524 } __aligned(8) param;
1525
1526 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
1527 perf_fetch_caller_regs(regs);
1528 *(struct pt_regs **)¶m = regs;
1529 param.syscall_nr = rec->nr;
1530 param.ret = rec->ret;
1531 return trace_call_bpf(call, ¶m);
1532 }
1533
perf_syscall_exit(void * ignore,struct pt_regs * regs,long ret)1534 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
1535 {
1536 struct syscall_metadata *sys_data;
1537 struct syscall_trace_exit *rec;
1538 struct pt_regs *fake_regs;
1539 struct hlist_head *head;
1540 bool valid_prog_array;
1541 int syscall_nr;
1542 int rctx;
1543 int size;
1544
1545 /*
1546 * Syscall probe called with preemption enabled, but the ring
1547 * buffer and per-cpu data require preemption to be disabled.
1548 */
1549 might_fault();
1550 guard(preempt_notrace)();
1551
1552 syscall_nr = trace_get_syscall_nr(current, regs);
1553 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
1554 return;
1555 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
1556 return;
1557
1558 sys_data = syscall_nr_to_meta(syscall_nr);
1559 if (!sys_data)
1560 return;
1561
1562 head = this_cpu_ptr(sys_data->exit_event->perf_events);
1563 valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
1564 if (!valid_prog_array && hlist_empty(head))
1565 return;
1566
1567 /* We can probably do that at build time */
1568 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
1569 size -= sizeof(u32);
1570
1571 rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
1572 if (!rec)
1573 return;
1574
1575 rec->nr = syscall_nr;
1576 rec->ret = syscall_get_return_value(current, regs);
1577
1578 if ((valid_prog_array &&
1579 !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
1580 hlist_empty(head)) {
1581 perf_swevent_put_recursion_context(rctx);
1582 return;
1583 }
1584
1585 perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1586 1, regs, head, NULL);
1587 }
1588
perf_sysexit_enable(struct trace_event_call * call)1589 static int perf_sysexit_enable(struct trace_event_call *call)
1590 {
1591 int num;
1592
1593 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1594
1595 guard(mutex)(&syscall_trace_lock);
1596 if (!sys_perf_refcount_exit) {
1597 int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
1598 if (ret) {
1599 pr_info("event trace: Could not activate syscall exit trace point");
1600 return ret;
1601 }
1602 }
1603 set_bit(num, enabled_perf_exit_syscalls);
1604 sys_perf_refcount_exit++;
1605 return 0;
1606 }
1607
perf_sysexit_disable(struct trace_event_call * call)1608 static void perf_sysexit_disable(struct trace_event_call *call)
1609 {
1610 int num;
1611
1612 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1613
1614 guard(mutex)(&syscall_trace_lock);
1615 sys_perf_refcount_exit--;
1616 clear_bit(num, enabled_perf_exit_syscalls);
1617 if (!sys_perf_refcount_exit)
1618 unregister_trace_sys_exit(perf_syscall_exit, NULL);
1619 }
1620
1621 #endif /* CONFIG_PERF_EVENTS */
1622
syscall_enter_register(struct trace_event_call * event,enum trace_reg type,void * data)1623 static int syscall_enter_register(struct trace_event_call *event,
1624 enum trace_reg type, void *data)
1625 {
1626 struct trace_event_file *file = data;
1627
1628 switch (type) {
1629 case TRACE_REG_REGISTER:
1630 return reg_event_syscall_enter(file, event);
1631 case TRACE_REG_UNREGISTER:
1632 unreg_event_syscall_enter(file, event);
1633 return 0;
1634
1635 #ifdef CONFIG_PERF_EVENTS
1636 case TRACE_REG_PERF_REGISTER:
1637 return perf_sysenter_enable(event);
1638 case TRACE_REG_PERF_UNREGISTER:
1639 perf_sysenter_disable(event);
1640 return 0;
1641 case TRACE_REG_PERF_OPEN:
1642 case TRACE_REG_PERF_CLOSE:
1643 case TRACE_REG_PERF_ADD:
1644 case TRACE_REG_PERF_DEL:
1645 return 0;
1646 #endif
1647 }
1648 return 0;
1649 }
1650
syscall_exit_register(struct trace_event_call * event,enum trace_reg type,void * data)1651 static int syscall_exit_register(struct trace_event_call *event,
1652 enum trace_reg type, void *data)
1653 {
1654 struct trace_event_file *file = data;
1655
1656 switch (type) {
1657 case TRACE_REG_REGISTER:
1658 return reg_event_syscall_exit(file, event);
1659 case TRACE_REG_UNREGISTER:
1660 unreg_event_syscall_exit(file, event);
1661 return 0;
1662
1663 #ifdef CONFIG_PERF_EVENTS
1664 case TRACE_REG_PERF_REGISTER:
1665 return perf_sysexit_enable(event);
1666 case TRACE_REG_PERF_UNREGISTER:
1667 perf_sysexit_disable(event);
1668 return 0;
1669 case TRACE_REG_PERF_OPEN:
1670 case TRACE_REG_PERF_CLOSE:
1671 case TRACE_REG_PERF_ADD:
1672 case TRACE_REG_PERF_DEL:
1673 return 0;
1674 #endif
1675 }
1676 return 0;
1677 }
1678