// name : syscall_common.h
/**
@file
@brief    Common structures and macros for system call hooks
@details  Copyright (c) 2017-2022 Acronis International GmbH
@author   Mikhail Molchanov ([email protected])
@since    $Id: $
*/

#pragma once

#include "module_rundown_protection.h"
#include "stringify.h"
#include "syscall_utils.h"

#include <linux/types.h>	// bool

/*
 * ============================================================================
 *
 * Here is the short summary about different types of compatibility syscalls (
 * based mostly on lwn.net "Anatomy of a system call, part 1" and "..., part 2":
 * https://lwn.net/Articles/604287/ and https://lwn.net/Articles/604515/,
 * respectively):
 *
 * ----------------------------------------------------------------------------
 *
 * First, you need to know that there are userspace and kernelspace components
 * each of which could either have 32 or 64-bit bitness:
 *
 * 1) 32-bit kernel supports only 32-bit userspace programs.
 * 2) 64-bit kernel supports:
 * 2.a) 64-bit userspace programs (x86_64)
 * 2.b) 32-bit userspace programs (x86_32)
 * 2.c) x32 ABI userspace programs (as described on Wikipedia and "The x32
 *      system call ABI" lwn.net's article (https://en.wikipedia.org/wiki/X32_ABI
 *      and https://lwn.net/Articles/456731/, respectively):
 *      x32 ABI allows programs to take advantage of the benefits of x86_64
 *      instruction set (larger number of CPU registers, function parameters
 *      passed via registers, ...) while using 32-bit pointers and thus avoiding
 *      the overhead of 64-bit pointers (mostly expanded memory use and a larger
 *      cache footprint))
 *
 * ----------------------------------------------------------------------------
 *
 * Next, take a look at "arch/x86/syscalls/syscall_32.tbl" and
 * "arch/x86/syscalls/syscall_64.tbl" - these are files (from which syscalls
 * tables are generated during the compilation of the Linux kernel) that contain
 * the lists of every syscall in the OS, their respective numbers, names of
 * functions implementing them for different bitnesses of user/kernel spaces and
 * some more.
 *
 * -- File "syscall_32.tbl" is present on both 32-bit and 64-bit kernel and has
 * the format:
 * <number> <abi> <name> <entry point> <compat entry point>
 *
 * Where:
 * <number>             - system-wide unique syscall number (each has a handy
 *                        define, e.g. 'pwritev' has a number 334 and a symbolic
 *                        define for this number: '__NR_pwritev')
 *
 * <abi>                - the ABI the entry belongs to ('i386' in this file;
 *                        '64' or 'x32' in "syscall_64.tbl")
 *
 * <name>               - the actual name of the syscall (e.g. 'pwritev')
 *
 * <entry point>        - the name for a function implementing given syscall for
 *                        the (1) case (as described before: 32-bit userspace
 *                        program running on 32-bit kernel);
 *                        NOTE: search implementation as
 *                              "SYSCALL_DEFINE.*(pwritev"
 *
 * <compat entry point> - the name for a function implementing given syscall for
 *                        the (2.b) case (32-bit user on 64-bit kernel);
 *                        NOTE: search implementation as
 *                              "COMPAT_SYSCALL_DEFINE.*(pwritev"
 *
 * -- File "syscall_64.tbl" is present only on 64-bit kernel and has the format:
 * <number> <abi> <name> <entry point>
 *
 * Where everything is similar, but:
 * <number> - from 0 to 511 - only (2.a) syscalls (64-bit user on 64-bit kernel)
 *                            similar to <entry point> column of
 *                            "syscall_32.tbl"
 *          - from 512      - only (2.c) syscalls (x32 ABI)
 *                            similar to <compat entry point> column of
 *                            "syscall_32.tbl"
 *
 * ----------------------------------------------------------------------------
 *
 * Let's examine, for example, 'pwritev()' syscall:
 *
 * -- "syscall_32.tbl" has following string for it:
 * <number>	<abi>	<name>		<entry point>	<compat entry point>
 * 334		i386	pwritev		sys_pwritev	compat_sys_pwritev
 *
 * -- "syscall_64.tbl" instead has two of them:
 * <number>	<abi>	<name>		<entry point>
 * 296		64	pwritev		sys_pwritev
 * ...
 * 535		x32	pwritev		compat_sys_pwritev64
 *
 * 2.a) For the most common case nowadays (64-bit user, 64-bit kernel)
 *      everything is simple: "syscall_64.tbl" on a 64-bit kernel has an
 *      analogous 'sys_call_table' in memory, so to call (2.a) 'pwritev' you
 *      need to call 'sys_pwritev' from "syscall_64.tbl" or
 *      'sys_call_table[__NR_pwritev]'.
 *      NOTE: include <linux/unistd.h> for macros.
 *
 * 1)   Surprisingly, for this case (32-bit user, 32-bit kernel) all is almost
 *      the same as in the previous case: "syscall_32.tbl" on 32-bit kernel has
 *      analogous 'sys_call_table' in memory, so to call (1) 'pwritev' you need
 *      to call 'sys_pwritev' from "syscall_32.tbl" or 'sys_call_table[__NR_pwritev]'.
 *
 * Now for mixed bitnesses:
 * 2.b) There (32-bit user, 64-bit kernel) you need 'compat_sys_pwritev' from
 *      "syscall_32.tbl", but remember - you have 64-bit kernel, so "syscall_32.tbl"
 *      won't be called 'sys_call_table' in memory (as it's the in-memory name
 *      for the 64-bit syscalls table) but rather 'ia32_sys_call_table' and the
 *      number of 'pwritev' from this "syscall_32.tbl" might be different from
 *      the number of ordinary (2.a) 'pwritev' from "syscall_64.tbl", so to call
 *      (2.b) 'pwritev' you need to call 'compat_sys_pwritev' from "syscall_32.tbl"
 *      or 'ia32_sys_call_table[__NR_ia32_pwritev]'.
 *      NOTE: include <asm/ia32_unistd.h> for macros.
 *
 * 2.c) Lastly, (x32 ABI) you need 'compat_sys_pwritev64' from "syscall_64.tbl";
 *      as before, "syscall_64.tbl" has analogous 'sys_call_table' in memory but
 *      'pwritev' is x32 ABI now, so it'll have its own number in this table
 *      (different from ordinary (2.a) 'pwritev'); because of that, to call
 *      (2.c) 'pwritev' do 'sys_call_table[__NR_x32_pwritev]'.
 *
 *
 * Once again (now with some ASCII art; here 32u = 32-bit userspace,
 *                                           64k = 64-bit kernelspace and so on):
 * -- "syscall_32.tbl":
 * <number>	<abi>	<name>		<entry point>	<compat entry point>
 * 334		i386	pwritev		 sys_pwritev	 compat_sys_pwritev
 *                                            ^                   ^
 *                                            |                   |
 *                                    ((1): 32u on 32k)  ((2.b): 32u on 64k)
 *
 * -- "syscall_64.tbl":
 * <number>	<abi>	<name>		<entry point>
 * 296		64	pwritev		sys_pwritev  <---------- ((2.a): 64u on 64k)
 * ...
 * 535		x32	pwritev		compat_sys_pwritev64  <-- ((2.c): x32 ABI)
 *
 * ----------------------------------------------------------------------------
 *
 * And the last thing:
 * if in "syscall_32.tbl" or "syscall_64.tbl" there is a blank cell in the table
 * instead of a compat syscall name ((2.b): 32u on 64k or (2.c): x32 ABI,
 * respectively), this means that there is no need for a dedicated compat
 * syscall and the ordinary syscall ((1): 32u on 32k or (2.a): 64u on 64k) does
 * both jobs: its own and the compat one. So, to call, for example, (2.b): 32u
 * on 64k 'creat', do 'ia32_sys_call_table[__NR_ia32_creat]' as with an ordinary
 * (2.b) syscall; this will call 'sys_creat', which does both its own job and
 * that of the imaginary 'compat_sys_creat'.
 *
 * There are even syscalls (e.g. ordinary 'read' or 'write') that have single
 * implementation for all four cases: (1): 32u on 32k, (2.a): 64u on 64k,
 * (2.b): 32u on 64k and (2.c): x32 ABI, and there is nothing strange, because
 * we need distinct compat functions only if syscall needs to "cope with
 * pointer-to-pointer arguments or pointer-to-struct-containing-pointer
 * arguments, for example 'struct iovec'".
 *
 * ============================================================================
 */

/*
 * 'renameat2' detection.  When the headers seen so far do not provide
 * '__NR_renameat2', supply a number (316 is the x86_64 one) so that code
 * pasting '__NR_' with the tag still compiles, and report the syscall as
 * not enabled.
 */
#ifndef __NR_renameat2
#define __NR_renameat2		316
#define RENAMEAT2_ENABLED	false
#else
#define RENAMEAT2_ENABLED	true
#endif

/*
 * 'preadv2' detection.  When the headers seen so far do not provide
 * '__NR_preadv2', supply a number (327 is the x86_64 one) so that code
 * pasting '__NR_' with the tag still compiles, and report the syscall as
 * not enabled.
 */
#ifndef __NR_preadv2
#define __NR_preadv2		327
#define PREADV2_ENABLED		false
#else
#define PREADV2_ENABLED		true
#endif

/*
 * 'pwritev2' detection.  When the headers seen so far do not provide
 * '__NR_pwritev2', supply a number (328 is the x86_64 one) so that code
 * pasting '__NR_' with the tag still compiles, and report the syscall as
 * not enabled.
 */
#ifndef __NR_pwritev2
#define __NR_pwritev2		328
#define PWRITEV2_ENABLED	false
#else
#define PWRITEV2_ENABLED	true
#endif

/*
 * Native-ABI helpers: case (1) on a 32-bit kernel, case (2.a) on a 64-bit one.
 *
 * DEFINE_HOOK_DESC(abi, tag, is_enabled)
 *     Initializer for one 'struct hook_desc': no original handler yet (NULL),
 *     SYSCALL_HOOK_TRAMPOLINE(abi, tag) as the hook, its stringified form as
 *     the name, '__NR_<tag>' as the syscall number and 'is_enabled' as the
 *     'enabled' flag.  The last parameter is deliberately not called
 *     'enabled': it would otherwise be substituted into the '.enabled'
 *     designator as well.
 *
 * SYSCALL_ORIG(abi, tag)
 *     The 'syscall_orig' pointer stored in 'hook_table' for this hook, cast to
 *     SYSCALL_ORIG_TYPE_NAME(abi, tag).
 *
 * SYSCALL_ORIG_FN(abi, tag)
 *     The same pointer converted to 'long'.
 */
#define DEFINE_HOOK_DESC(abi, tag, is_enabled) \
	{ \
		.syscall_orig = NULL, \
		.syscall_hook = SYSCALL_HOOK_TRAMPOLINE(abi, tag), \
		.syscall_name = STRINGIFY(SYSCALL_HOOK_TRAMPOLINE(abi, tag)), \
		.syscall_nr   = __NR_##tag, \
		.enabled      = is_enabled \
	}

#define SYSCALL_ORIG(abi, tag) \
	((SYSCALL_ORIG_TYPE_NAME(abi, tag)) \
	 (hook_table[SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

#define SYSCALL_ORIG_FN(abi, tag) \
	((long) (hook_table[SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

/*
 * Helpers for case (2.b) when the <compat entry point> cell in
 * "syscall_32.tbl" is empty: the hook keeps the (2.a) prototype but lives in
 * 'ia32_sys_call_table', like every other (2.b) syscall.
 *
 * DEFINE_IA32_HOOK_DESC(abi, tag, is_enabled)
 *     Initializer for one 'struct hook_desc'; identical to the native variant
 *     except that the syscall number is '__NR_ia32_<tag>'.  The last parameter
 *     is deliberately not called 'enabled': it would otherwise be substituted
 *     into the '.enabled' designator as well.
 *
 * IA32_SYSCALL_ORIG(abi, tag)
 *     The 'syscall_orig' pointer stored in 'ia32_hook_table' for this hook,
 *     cast to SYSCALL_ORIG_TYPE_NAME(abi, tag).
 *
 * IA32_SYSCALL_ORIG_FN(abi, tag)
 *     The same pointer converted to 'long'.
 */
#define DEFINE_IA32_HOOK_DESC(abi, tag, is_enabled) \
	{ \
		.syscall_orig = NULL, \
		.syscall_hook = SYSCALL_HOOK_TRAMPOLINE(abi, tag), \
		.syscall_name = STRINGIFY(SYSCALL_HOOK_TRAMPOLINE(abi, tag)), \
		.syscall_nr   = __NR_ia32_##tag, \
		.enabled      = is_enabled \
	}

#define IA32_SYSCALL_ORIG(abi, tag) \
	((SYSCALL_ORIG_TYPE_NAME(abi, tag)) \
	 (ia32_hook_table[SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

#define IA32_SYSCALL_ORIG_FN(abi, tag) \
	((long) (ia32_hook_table[SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

/*
 * Helpers for case (2.b) proper, i.e. syscalls that have a dedicated compat
 * entry point.
 *
 * DEFINE_COMPAT_HOOK_DESC(abi, tag, is_enabled)
 *     Initializer for one 'struct hook_desc' using the trampoline as the hook
 *     and '__NR_ia32_<tag>' as the syscall number.
 *
 * DEFINE_COMPAT_HOOK_DESC_PRE(abi, tag, is_enabled)
 *     Same, but the hook is SYSCALL_HOOK_NAME_PRE(abi, tag) while the recorded
 *     name is the stringified SYSCALL_HOOK_NAME(abi, tag).
 *     NOTE(review): the name intentionally(?) differs from the installed
 *     symbol here, unlike in the trampoline-based variants - confirm.
 *
 * In both initializers the last parameter is deliberately not called
 * 'enabled': it would otherwise be substituted into the '.enabled' designator
 * as well.
 *
 * COMPAT_SYSCALL_ORIG(abi, tag)
 *     The 'syscall_orig' pointer stored in 'ia32_hook_table' at
 *     COMPAT_SYSCALL_HOOK_ID(abi, tag), cast to
 *     SYSCALL_ORIG_TYPE_NAME(abi, tag).
 *
 * COMPAT_SYSCALL_ORIG_FN(abi, tag)
 *     The same pointer converted to 'long'.
 */
#define DEFINE_COMPAT_HOOK_DESC(abi, tag, is_enabled) \
	{ \
		.syscall_orig = NULL, \
		.syscall_hook = SYSCALL_HOOK_TRAMPOLINE(abi, tag), \
		.syscall_name = STRINGIFY(SYSCALL_HOOK_TRAMPOLINE(abi, tag)), \
		.syscall_nr   = __NR_ia32_##tag, \
		.enabled      = is_enabled \
	}

#define DEFINE_COMPAT_HOOK_DESC_PRE(abi, tag, is_enabled) \
	{ \
		.syscall_orig = NULL, \
		.syscall_hook = SYSCALL_HOOK_NAME_PRE(abi, tag), \
		.syscall_name = STRINGIFY(SYSCALL_HOOK_NAME(abi, tag)), \
		.syscall_nr   = __NR_ia32_##tag, \
		.enabled      = is_enabled \
	}

#define COMPAT_SYSCALL_ORIG(abi, tag) \
	((SYSCALL_ORIG_TYPE_NAME(abi, tag)) \
	 (ia32_hook_table[COMPAT_SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

#define COMPAT_SYSCALL_ORIG_FN(abi, tag) \
	((long) (ia32_hook_table[COMPAT_SYSCALL_HOOK_ID(abi, tag)].syscall_orig))

/*
 * Indices of the native-ABI hooks inside 'hook_table': SYSCALL_ORIG() and
 * SYSCALL_ORIG_FN() subscript that table with SYSCALL_HOOK_ID(abi, tag).
 *
 * NOTE(review): SYSCALL_HOOK_ID() is defined outside this header; presumably
 * it expands to a plain enumerator name, which would make the values run from
 * 0 in declaration order and TOTAL_HOOKS_COUNT equal the number of hooks -
 * confirm. If so, the entries of 'hook_table' have to be listed in this same
 * order.
 */
enum hook_numbs {
	SYSCALL_HOOK_ID(sys, creat),

	SYSCALL_HOOK_ID(sys, open),
	SYSCALL_HOOK_ID(sys, openat),

	SYSCALL_HOOK_ID(sys, close),

	SYSCALL_HOOK_ID(sys, read),
	SYSCALL_HOOK_ID(sys, pread64),
	SYSCALL_HOOK_ID(sys, readv),
	SYSCALL_HOOK_ID(sys, preadv),
	SYSCALL_HOOK_ID(sys, preadv2),

	SYSCALL_HOOK_ID(sys, write),
	SYSCALL_HOOK_ID(sys, pwrite64),
	SYSCALL_HOOK_ID(sys, writev),
	SYSCALL_HOOK_ID(sys, pwritev),
	SYSCALL_HOOK_ID(sys, pwritev2),

	SYSCALL_HOOK_ID(sys, rename),
	SYSCALL_HOOK_ID(sys, renameat),
	SYSCALL_HOOK_ID(sys, renameat2),

	SYSCALL_HOOK_ID(sys, unlink),
	SYSCALL_HOOK_ID(sys, unlinkat),

	/* Not a hook: terminator placed after the last hook id. */
	TOTAL_HOOKS_COUNT
};

/*
 * There are practically no '__NR_ia32_*' macros on CentOS 6 (besides the ones
 * defined in "/usr/src/kernels/`uname -r`/arch/x86/include/asm/ia32_unistd.h"),
 * so define the needed syscall numbers manually when they are missing.
 *
 * Syscalls are never renumbered (on sane kernels), so get the right numbers from:
 * "/usr/src/kernels/`uname -r`/arch/x86/include/asm/unistd_32.h"
 *
 * Every syscall listed in 'enum ia32_hook_numbs' must have a number here,
 * because the DEFINE_IA32_HOOK_DESC()/DEFINE_COMPAT_HOOK_DESC*() macros paste
 * '__NR_ia32_' with the syscall tag.  The fallbacks are kept in the same order
 * as that enum so a missing entry is easy to spot.
 */

/* creat / open / openat / close */
#ifndef __NR_ia32_creat
#define __NR_ia32_creat		  8
#endif

#ifndef __NR_ia32_open
#define __NR_ia32_open		  5
#endif

#ifndef __NR_ia32_openat
#define __NR_ia32_openat	295
#endif

#ifndef __NR_ia32_close
#define __NR_ia32_close		  6
#endif

/* read family */
#ifndef __NR_ia32_read
#define __NR_ia32_read		  3
#endif

#ifndef __NR_ia32_pread64
#define __NR_ia32_pread64	180
#endif

#ifndef __NR_ia32_readv
#define __NR_ia32_readv		145
#endif

#ifndef __NR_ia32_preadv
#define __NR_ia32_preadv	333
#endif

#ifndef __NR_ia32_preadv2
#define __NR_ia32_preadv2	378
#endif

/* write family */
#ifndef __NR_ia32_write
#define __NR_ia32_write		  4
#endif

#ifndef __NR_ia32_pwrite64
#define __NR_ia32_pwrite64	181
#endif

#ifndef __NR_ia32_writev
#define __NR_ia32_writev	146
#endif

#ifndef __NR_ia32_pwritev
#define __NR_ia32_pwritev	334
#endif

#ifndef __NR_ia32_pwritev2
#define __NR_ia32_pwritev2	379
#endif

/* rename family */
#ifndef __NR_ia32_rename
#define __NR_ia32_rename	 38
#endif

#ifndef __NR_ia32_renameat
#define __NR_ia32_renameat	302
#endif

#ifndef __NR_ia32_renameat2
#define __NR_ia32_renameat2	353
#endif

/* unlink family */
#ifndef __NR_ia32_unlink
#define __NR_ia32_unlink	 10
#endif

#ifndef __NR_ia32_unlinkat
#define __NR_ia32_unlinkat	301
#endif

/*
 * Indices of the 32-bit-userspace hooks inside 'ia32_hook_table'.
 *
 * Entries declared with SYSCALL_HOOK_ID(ia32_sys, ...) are the ones looked up
 * by IA32_SYSCALL_ORIG*(); entries declared with
 * COMPAT_SYSCALL_HOOK_ID(compat_sys, ...) are the ones looked up by
 * COMPAT_SYSCALL_ORIG*().  Both kinds share this single enum and table.
 *
 * NOTE(review): both id macros are defined outside this header; presumably
 * they expand to plain enumerator names, which would make the values run from
 * 0 in declaration order and TOTAL_IA32_HOOKS_COUNT equal the number of hooks
 * - confirm. If so, the entries of 'ia32_hook_table' have to be listed in this
 * same order.
 */
enum ia32_hook_numbs {
	SYSCALL_HOOK_ID(ia32_sys, creat),

	/*
	 * In fact, it is 'compat_sys_open()', but function prototype is identical
	 * to ordinary 'sys_open()', so to simplify our code we treat it as
	 * 'ia32_sys_open()'.
	 */
	SYSCALL_HOOK_ID(ia32_sys, open),
	SYSCALL_HOOK_ID(ia32_sys, openat),

	SYSCALL_HOOK_ID(ia32_sys, close),

	SYSCALL_HOOK_ID(ia32_sys, read),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, pread64),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, readv),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, preadv),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, preadv2),

	SYSCALL_HOOK_ID(ia32_sys, write),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, pwrite64),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, writev),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, pwritev),
	COMPAT_SYSCALL_HOOK_ID(compat_sys, pwritev2),

	SYSCALL_HOOK_ID(ia32_sys, rename),
	SYSCALL_HOOK_ID(ia32_sys, renameat),
	SYSCALL_HOOK_ID(ia32_sys, renameat2),

	SYSCALL_HOOK_ID(ia32_sys, unlink),
	SYSCALL_HOOK_ID(ia32_sys, unlinkat),

	/* Not a hook: terminator placed after the last hook id. */
	TOTAL_IA32_HOOKS_COUNT
};

/*
 * Descriptor of one hooked syscall.  'hook_table' and 'ia32_hook_table' are
 * arrays of these, and the DEFINE_*_HOOK_DESC() macros produce initializers
 * for them.
 */
struct hook_desc {
	/*
	 * Initialized to NULL by the DEFINE_*_HOOK_DESC() macros and read back by
	 * the *SYSCALL_ORIG*() macros.
	 * NOTE(review): presumably set to the original syscall handler when the
	 * hook is attached - confirm against the attach code.
	 */
	void *syscall_orig;
	/* The hook function supplied by the DEFINE_*_HOOK_DESC() macros. */
	void *syscall_hook;
	/* Stringified hook name supplied by the DEFINE_*_HOOK_DESC() macros. */
	const char *syscall_name;
	/* Syscall number: '__NR_<tag>' or '__NR_ia32_<tag>'. */
	int syscall_nr;
	/*
	 * Taken from the last argument of the DEFINE_*_HOOK_DESC() macros.
	 *
	 * NOTE(review): the previous comment here described setting per-distro
	 * bits in an 'activation_bitmask' so that a syscall present on a given
	 * distro/kernel gets inserted upon driver start. This structure has no
	 * such member; presumably this flag now plays that role (cf.
	 * RENAMEAT2_ENABLED and friends) - confirm.
	 */
	// TODO: This is culprit, refactoring of 'fs_syscall_hooks' is required to remove it
	bool enabled;
};

/*
 * Hook descriptor tables, defined elsewhere.  The *SYSCALL_ORIG*() macros in
 * this header subscript them with the SYSCALL_HOOK_ID() /
 * COMPAT_SYSCALL_HOOK_ID() values.
 */
extern struct hook_desc hook_table[];
extern struct hook_desc ia32_hook_table[];

/*
 * Attach / detach the syscall hooks.
 * NOTE(review): the implementations are not visible from this header;
 * presumably both return 0 on success and a negative error code otherwise,
 * and the meaning of 'safe_mode' is defined by the implementation - confirm.
 */
int syscall_hooks_attach(bool safe_mode);
int syscall_hooks_detach(void);

/*
 * Thin aliases for the module rundown protection lock/unlock calls.
 * NOTE(review): presumably every hook body is meant to be bracketed by
 * HOOK_PROLOG()/HOOK_EPILOG() so the module cannot go away while a hook is
 * running - confirm against "module_rundown_protection.h" and the hook code.
 */
#define HOOK_PROLOG() mod_rundown_protection_lock()
#define HOOK_EPILOG() mod_rundown_protection_unlock()

/* © 2025 Cubjrnet7 */