/**
@file
@brief    Message transport between kernel and userspace
@details  Copyright (c) 2017-2021 Acronis International GmbH
@author   Mikhail Krivtsov
@since    $Id: $
*/

#include "transport.h"

#include "debug.h"
#include "device.h"
#include "file_contexts.h"
#include "ftrace_hooks/ftrace_events.h"
#include "ftrace_hooks/fsnotify_listener.h"
#include "lsm_common.h"
#include "memory.h"
#include "message.h"
#include "si_fp_properties.h"
#include "syscall_common.h"
#include "task_info_map.h"
#include "tracepoints.h"
#include "transport_protocol.h"

#include <linux/bitmap.h>
#include <asm/io.h>
#include <linux/fcntl.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/list.h>
#include <linux/jiffies.h>	// msecs_to_jiffies()
#include <linux/mutex.h>
#ifndef KERNEL_MOCK
#include <linux/sched.h>
#else
#include <mock/mock_sched.h>
#endif
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>	// copy_from_user(), copy_to_user()
#include <linux/vmalloc.h>
#include <linux/wait.h>		// wait_event*(), wake_up*()

#define TRANSPORT_MSG_SIZE_MAX (1<<10)
// This size is reasonable for a good number of messages, although it is really a bare minimum
#define TRANSPORT_MINIMAL_SHARED_DATA_QUEUE_SIZE (128 * 1024)
#define TRANSPORT_QUEUE_CAPACITY (0x1000 / sizeof(msg_t *))
#define TRANSPORT_WAIT_REPLY_TIMEOUT_MSECS (60*1000)
#define TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS (5*1000)

#define TRANSPORT_PRINTF(format, args...) DPRINTF(format, ##args)

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#define DATA_QUEUE_HEADER_SIZE sizeof(shared_data_queue_t)
#define DATA_QUEUE_ENTRY_HEADER_SIZE sizeof(data_queue_entry_t)

static transport_event_t* transport_event_new(void)
{
	transport_event_t* event = mem_alloc0(sizeof(transport_event_t));
	if (!event)
		return NULL;

	atomic64_add(1, &g_memory_metrics->total_transport_events);
	atomic_set(&event->refcount, 1);
	init_waitqueue_head(&event->msg_wait_queue);
	return event;
}

static transport_event_t* transport_event_ref(transport_event_t* event)
{
	atomic_inc(&event->refcount);
	return event;
}

static void transport_event_unref(transport_event_t* event)
{
	if (atomic_dec_and_test(&event->refcount)) {
		mem_free(event);
		atomic64_sub(1, &g_memory_metrics->total_transport_events);
	}
}
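
/*
    Illustrative sketch (not part of the driver logic): the intended refcount
    pairing for deferred wakeups, using only the helpers above. A sender takes
    a reference while still holding its spinlock and drops it once the wakeup
    has been issued outside the lock:

        transport_event_t *ev = transport_event_ref(transport->event);
        ...
        wake_up_interruptible(&ev->msg_wait_queue);
        transport_event_unref(ev);
*/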

transport_global_t transport_global;

static void transport_global_init(void)
{
	mutex_init(&transport_global.transport_mutex);
	transport_global.transport_count = 0;
	transport_global.transports = (transports_t) {0};
	atomic64_set(&transport_global.msg_id_sequence, 0);
	transport_global.last_transport_seq_num = 0;
	transport_global.transport_ids = (transport_ids_t) {0};
}

// must be called under 'transport_global.transport_mutex'
static transport_id_t transport_acquire_id(void) {
	transport_id_t transport_id = 0;
	int i;
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		if (0 == READ_ONCE(transport_global.transport_ids.ids[i])) {
			transport_id = transport_id_make(++transport_global.last_transport_seq_num, i);
			WRITE_ONCE(transport_global.transport_ids.ids[i], transport_id);
			break;
		}
	}
	return transport_id;
}

// must be called under 'transport_global.transport_mutex'
static void transport_release_id(transport_id_t id) {
	int i = transport_id_index(id);
	if (id == READ_ONCE(transport_global.transport_ids.ids[i])) {
		WRITE_ONCE(transport_global.transport_ids.ids[i], 0);
	} else {
		WPRINTF("transport id %llu not found", id);
	}
}
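
/*
    Note on ids (a summary of the helpers used above; the exact bit layout
    lives in their definitions): a transport id packs a monotonically
    increasing sequence number together with a slot index into
    'transport_global.transport_ids.ids'. 'transport_id_make(seq, i)' builds
    the id, 'transport_id_index(id)' recovers the slot, and a stored value of
    0 marks a free slot.
*/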

// must be called under 'transport_global.transport_mutex'
static void transport_global_register(transport_t *transport)
{
	int idx = transport_id_index(transport->transport_id);
	rcu_assign_pointer(transport_global.transports.transports[idx], transport);
	WRITE_ONCE(transport_global.transports.control_tgid[idx], transport->control_tgid);
}

// must be called under 'transport_global.transport_mutex'
static void transport_global_unregister(transport_t *transport)
{
	int idx = transport_id_index(transport->transport_id);
	rcu_assign_pointer(transport_global.transports.transports[idx], NULL);
	WRITE_ONCE(transport_global.transports.control_tgid[idx], 0);
}

// must be called under 'transport_global.transport_mutex'
static void transport_global_recalculate_combined_event_mask_impl(void)
{
	int i;
	uint64_t combined_mask = 0;
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		transport_t *transport;
		transport_id_t transport_id = READ_ONCE(transport_global.transport_ids.ids[i]);
		if (!transport_id)
			continue;

		// no RCU accessor is needed for this pointer because we hold 'transport_mutex'
		transport = transport_global.transports.transports[i];
		combined_mask |= READ_ONCE(transport->events_mask);
	}

	WRITE_ONCE(transport_global.combined_events_mask, combined_mask);
}

struct event_subtype_relation_t
{
	uint64_t events_mask;
	uint64_t generated_subtype_mask;
};

static const struct event_subtype_relation_t k_subtypes_relations[] = {
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_OPEN)   , MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_OPEN_MODIFY)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_OPEN_READ)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_OPEN_MAY_CREATE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_OPENDIR) },
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_OPEN) , MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_OPEN_MODIFY)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_OPEN_READ)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_OPEN_MAY_CREATE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_OPENDIR) },
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_CLOSE)  , MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_CLOSE_NON_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_CLOSE_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_CLOSEDIR) },
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_CLOSE), MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_CLOSE_NON_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_CLOSE_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_CLOSEDIR) },
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_MMAP)   , MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_MMAP_NON_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SYNC_MMAP_WRITE) },
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_MMAP) , MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_MMAP_NON_WRITE)
	                                                        | MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_NOTIFY_MMAP_WRITE) },
	// All fs operations relate to generic subtypes like 'special'
	{ MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_OPEN)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_OPEN)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_CREATE)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_RENAME)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_RENAME)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FSNOTIFY_RENAME)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_UNLINK)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FSNOTIFY_UNLINK)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_CLOSE)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_CLOSE)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FSNOTIFY_OPEN)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_RENAME)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_UNLINK)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FSNOTIFY_CREATE)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_MMAP)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_MMAP)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_LINK)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_LINK)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_SYNC_FILE_PRE_TRUNCATE)
	| MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_NOTIFY_FILE_PRE_TRUNCATE), MSG_TYPE_TO_EVENT_MASK(FP_SI_ST_SPECIAL) },
};

// If any transport includes a subtype, the combined mask must include that subtype
static void transport_global_recalculate_combined_event_subtype_inclusion_mask_impl(void)
{
	int i;
	uint64_t combined_mask = 0;
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		int j;
		transport_t *transport;
		transport_id_t transport_id = READ_ONCE(transport_global.transport_ids.ids[i]);
		if (!transport_id)
			continue;

		// no RCU accessor is needed for this pointer because we hold 'transport_mutex'
		transport = transport_global.transports.transports[i];
		for (j = 0; j < (int) ARRAY_SIZE(k_subtypes_relations); j++) {
			const struct event_subtype_relation_t* relation = &k_subtypes_relations[j];
			if (transport->events_mask & relation->events_mask) {
				combined_mask |= transport->events_subtype_inclusion_mask & relation->generated_subtype_mask;
			}
		}
	}

	WRITE_ONCE(transport_global.combined_events_subtype_inclusion_mask, combined_mask);
}

// If all transports exclude a subtype, the combined mask excludes that subtype
static void transport_global_recalculate_combined_event_subtype_exclusion_mask_impl(void)
{
	int i;
	uint64_t combined_mask = ~0ULL;
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		int j;
		uint64_t transport_not_excluded_subtypes;
		transport_t *transport;
		transport_id_t transport_id = READ_ONCE(transport_global.transport_ids.ids[i]);
		if (!transport_id)
			continue;

		// no RCU accessor is needed for this pointer because we hold 'transport_mutex'
		transport = transport_global.transports.transports[i];
		transport_not_excluded_subtypes = ~transport->events_subtype_exclusion_mask;
		for (j = 0; j < (int) ARRAY_SIZE(k_subtypes_relations); j++) {
			const struct event_subtype_relation_t* relation = &k_subtypes_relations[j];
			if (transport->events_mask & relation->events_mask) {
				// drop bits from the combined mask if it is known that generated events are not being excluded
				combined_mask &= ~(transport_not_excluded_subtypes & relation->generated_subtype_mask);
			}
		}
	}

	WRITE_ONCE(transport_global.combined_events_subtype_exclusion_mask, combined_mask);
}
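
/*
    Worked example (illustrative): suppose transport A listens to
    FP_SI_OT_SYNC_FILE_PRE_OPEN and excludes FP_SI_ST_SYNC_OPENDIR, while
    transport B listens to the same operation and excludes nothing. B clears
    all of its generated OPEN subtype bits from the combined mask, so only
    subtypes excluded by *every* interested transport stay set - the combined
    exclusion mask ends up without FP_SI_ST_SYNC_OPENDIR, and OPENDIR events
    keep being generated.
*/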

static void transport_global_recalculate_combined_all_event_masks_impl(void)
{
	transport_global_recalculate_combined_event_mask_impl();
	transport_global_recalculate_combined_event_subtype_inclusion_mask_impl();
	transport_global_recalculate_combined_event_subtype_exclusion_mask_impl();
}

static void transport_global_recalculate_combined_all_event_masks(void)
{
	mutex_lock(&transport_global.transport_mutex);
	transport_global_recalculate_combined_all_event_masks_impl();
	mutex_unlock(&transport_global.transport_mutex);
}

static void drop_msgs_impl(ring_t *ring)
{
	while (!ring_is_empty(ring)) {
		msg_t *msg = *(msg_t **) ring_consumer_ptr(ring);
		msg_unref(msg);
		ring_consumer_index_move_one(ring);
	}
}

/*
    'msg ref/unref' for messages stored in 'sent_msgs_set' are invoked in
    'msg_reply_wait_count inc/dec'.
    There is no need for separate 'msg ref/unref' calls.
*/
static void drop_sent_msgs_impl(set_t *set)
{
	void *item_ptr = set_begin_ptr(set);
	void *end_ptr = set_end_ptr(set);
	while (item_ptr < end_ptr) {
		msg_t *msg = *(msg_t **) item_ptr;
		msg_reply_wait_count_dec(msg);
		item_ptr = set_ptr_next(set, item_ptr);
	}
	set->count = 0;
}

static void transport_shutdown(transport_t *transport)
{
	DPRINTF("transport=%p", transport);
	spin_lock(&transport->msg_spinlock);
	{
		WRITE_ONCE(transport->events_mask, 0);
		WRITE_ONCE(transport->shutdown, true);

		// Discard undelivered messages
		drop_msgs_impl(&transport->msg_ring);

		// Discard messages waiting for 'reply'
		drop_sent_msgs_impl(&transport->sent_msgs_set);
	}
	spin_unlock(&transport->msg_spinlock);

	// wakeup all userspace 'read' waiters
	wake_up_all(&transport->event->msg_wait_queue);
}

static const char* transport_name(transport_t* transport)
{
	client_type_t client_type = transport->client_type;
	switch (client_type)
	{
		case CLIENT_UNKNOWN:
			return "?";
		case CLIENT_TEST:
			return "test";
		case CLIENT_AP:
			return "ap";
		case CLIENT_BE:
			return "be";
		case CLIENT_RTP:
			return "rtp";
	}

	return "?";
}

#define TRANSPORT_FMT "%s[%d]"
#define TRANSPORT_PRINT(transport) transport_name(transport), transport->client_type

// identify and shut down the transport that failed to reply
static void transport_shutdown_msg(transport_t *transport, msg_t *unreplied_msg)
{
	bool found = false;

	DPRINTF("transport=%p unreplied_msg=%p", transport, unreplied_msg);

	spin_lock(&transport->msg_spinlock);
	{
		void *item_ptr = set_begin_ptr(&transport->sent_msgs_set);
		void *end_ptr = set_end_ptr(&transport->sent_msgs_set);
		while (item_ptr < end_ptr) {
			if (unreplied_msg == *(msg_t **) item_ptr) {
				found = true;
				break;
			}
			item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr);
		}
	}
	spin_unlock(&transport->msg_spinlock);

	if (found) {
		WPRINTF("deactivating transport " TRANSPORT_FMT " on reply wait timeout", TRANSPORT_PRINT(transport));
		transport_shutdown(transport);
	}
}

// identify and shut down the transport that failed to reply
static void transport_global_shutdown_msg(msg_t *unreplied_msg)
{
	int i;

	DPRINTF("unreplied_msg=%p", unreplied_msg);
	rcu_read_lock();
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		transport_t *transport = rcu_dereference(transport_global.transports.transports[i]);
		if (!transport)
			continue;

		transport_shutdown_msg(transport, unreplied_msg);
	}
	rcu_read_unlock();
}

static void transport_disable(transport_t *transport)
{
	DPRINTF("transport=%p", transport);
	transport_global_unregister(transport);
	transport_shutdown(transport);

	transport_release_id(transport->transport_id);
	release_file_context_entry(transport->transport_id);
}

static void transport_free(transport_t *transport)
{
	DPRINTF("transport=%p", transport);
	IPRINTF("message queue items_count_max=%u capacity=%u",
			ring_items_count_max(&transport->msg_ring),
			ring_capacity(&transport->msg_ring));
	IPRINTF("sent_msgs_set items_count_max=%u capacity=%u",
			set_items_count_max(&transport->sent_msgs_set),
			set_fetch_capacity(&transport->sent_msgs_set));

	mem_free(ring_buffer(&transport->msg_ring));
	mem_free(set_buffer(&transport->sent_msgs_set));

	if (transport->queue) {
		vfree(transport->queue);
	}

	transport_event_unref(transport->event);

	mem_free(transport);
	atomic64_sub(1, &g_memory_metrics->total_transports);
}

struct transport_attribute {
	struct attribute attr;
	ssize_t (*show)(transport_t* transport, char *buf);
};

static ssize_t bytes_written_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%lu\n", (unsigned long) transport->bytes_written); }

static ssize_t queue_size_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%u\n", transport->queue_size); }

static ssize_t insert_filled_size_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%u\n", transport->insert_filled_size); }

static ssize_t insert_filled_size_max_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%u\n", transport->insert_filled_size_max); }

static ssize_t wait_set_count_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%u\n", transport->sent_msgs_set.count); }

static ssize_t wait_set_count_max_show(transport_t* transport, char *buf)
{ return sprintf(buf, "%u\n", transport->sent_msgs_set.count_max); }

#define TRANSPORT_METRIC_X \
TRANSPORT_METRIC(bytes_written) \
TRANSPORT_METRIC(queue_size) \
TRANSPORT_METRIC(insert_filled_size) \
TRANSPORT_METRIC(insert_filled_size_max) \
TRANSPORT_METRIC(wait_set_count) \
TRANSPORT_METRIC(wait_set_count_max)

#define TRANSPORT_METRIC(_name) static const struct transport_attribute s_transport_attr_##_name = __ATTR_RO(_name);
TRANSPORT_METRIC_X
#undef TRANSPORT_METRIC

static const struct attribute *s_transport_attrs[] = {
#define TRANSPORT_METRIC(_name) &s_transport_attr_##_name.attr,
TRANSPORT_METRIC_X
#undef TRANSPORT_METRIC
	NULL
};
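
/*
    These read-only attributes are exposed through sysfs under the transport's
    kobject, which 'transport_new()' registers as "transport<id>" below the
    driver's metrics kobject. The metrics can then be inspected with a plain
    read; the path below is illustrative only:

        cat /sys/.../transport1/queue_size
*/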

#define to_transport(_at) container_of(to_safe_kobject(_at), transport_t, skobj)
#define to_transport_attr(_at) container_of(_at, struct transport_attribute, attr)

static ssize_t transport_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	transport_t* transport = to_transport(kobj);
	struct transport_attribute *transport_attr = to_transport_attr(attr);
	return transport_attr->show(transport, buf);
}

const struct sysfs_ops s_transport_sysfs_ops = {
	.show = transport_show,
};

static struct kobj_type s_transport_ktype = {
	.release = safe_kobject_sysfs_release,
	.sysfs_ops = &s_transport_sysfs_ops,
};

static void transport_del(transport_t *transport)
{
	sysfs_remove_files(&transport->skobj.kobj, s_transport_attrs);
	safe_kobject_del(&transport->skobj);
	transport_free(transport);
}

static bool transport_ring_init(ring_t *ring)
{
	size_t buffer_size = TRANSPORT_QUEUE_CAPACITY * sizeof(msg_t *);
	msg_t **msgs;
	bool success;
	if (!buffer_size) {
		msgs = NULL;
		success = true;
	} else {
		msgs = mem_alloc0(buffer_size);
		success = (bool) msgs;
	}
	ring_init(ring, msgs, buffer_size, sizeof(msg_t *));
	return success;
}

static bool transport_set_init(set_t *set)
{
	size_t buffer_size = TRANSPORT_QUEUE_CAPACITY * sizeof(msg_t *);
	msg_t **msgs;
	bool success;
	if (!buffer_size) {
		msgs = NULL;
		success = true;
	} else {
		msgs = mem_alloc0(buffer_size);
		success = (bool) msgs;
	}
	set_init(set, msgs, buffer_size, sizeof(msg_t *));
	return success;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Shared with userspace Data Queue implementation

#ifndef smp_store_release
#define smp_store_release(p, v)						\
do {									\
	barrier();							\
	WRITE_ONCE(*p, v);						\
} while (0)
#endif

#ifndef smp_load_acquire
#define smp_load_acquire(p)						\
({									\
	typeof(*p) ___p1 = READ_ONCE(*p);				\
	barrier();							\
	___p1;								\
})
#endif

#define DATA_QUEUE_ENTRY_AT(queue, v) (data_queue_entry_t*)((uint8_t *)queue->entries + v)

#ifdef ROUND_UP
#undef ROUND_UP
#endif

#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
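
// Example: ROUND_UP(13, sizeof(uint32_t)) == 16. Used below to keep the
// published 'tail' aligned to a 4-byte boundary.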

static inline void data_queue_write_new_entry(shared_data_queue_t *queue, uint32_t offset, const SiEvent* event)
{
	data_queue_entry_t *entry = DATA_QUEUE_ENTRY_AT(queue, offset);
	memcpy(entry, event, event->Size);
}

// This function is called from the data queue 'writer' under the spin_lock, as it is NOT thread-safe.
// As such, reads from 'queue->tail' can be done using 'READ_ONCE' (relaxed); writes must be done using 'smp_store_release'.
// The 'reader' may decide to alter 'queue->head', so 'smp_load_acquire' must be used to read it; writes to it are NOT allowed here.
static bool transport_shared_data_queue_enqueue_impl(transport_t *transport, const SiEvent* event, transport_event_t **deferred_wakeup)
{
	uint32_t head, tail, new_tail;
	uint32_t entry_size = event->Size;
	shared_data_queue_t *queue = transport->queue;
	uint32_t queue_size = transport->queue_size;

	transport->bytes_written += entry_size;

	// Notice that we are not doing any memory shenanigans here to load 'tail' & 'head'.
	// The barriers will be applied later if it turns out they are necessary.
	// !!! 'head' might not be synchronized with the 'reader'; that is OK and will be handled at the end.
	tail = READ_ONCE(queue->tail);
	head = smp_load_acquire(&queue->head);

	// Check for unreasonable 'tail' or 'head', must never happen.
	if (queue_size < tail || queue_size < head) {
		WPRINTF("Invalid tail/head detected: tail=%u, head=%u, size=%u"
					, (unsigned) tail, (unsigned) head, (unsigned) queue_size);
		return false;
	}

	// Start inserting the contents of 'data' in the shared data queue
	if (tail >= head) {
		uint32_t filled_size = tail - head;
		transport->insert_filled_size = filled_size;
		if (filled_size > transport->insert_filled_size_max)
			transport->insert_filled_size_max = filled_size;

		// Tail is further than head, it is a regular scenario. Handle it
		//      head          tail
		//      V             V
		// -----|*************|-----------------
		//             ^                ^
		//     data to be dequeued      |
		//                          free space
		if ((tail + entry_size) <= queue_size) {
			// There is enough buffer in the 'tail' of the queue, write the entry and move the tail
			//      head          tail    new_tail
			//      V             V       V
			// -----|*************|+++++++|-------
			//                        ^
			//                     new entry
			data_queue_write_new_entry(queue, tail /*off*/, event);
			new_tail = tail + entry_size;
		} else if (head > entry_size) {
			// As the first condition was not satisfied, we cannot put data after 'tail'.
			// Have to wrap around to the start, where there is enough space before the userspace 'head'.
			//                      head         tail
			//                      V            V
			// |++++++|------------|*************|?? <- zapped entry w/ size>queue_size-tail, if fits
			// ^      ^
			// off    new_tail

			// Need to tell userspace that the current entry is too long to fit at the end.
			// If there is not enough space to even place an entry header, do nothing.
			// Otherwise, deliberately zap the entry by putting a 'size' that is too big.
			if ((queue_size - tail) >= DATA_QUEUE_ENTRY_HEADER_SIZE) {
				data_queue_entry_t *entry_to_zap = DATA_QUEUE_ENTRY_AT(queue, tail);
				entry_to_zap->size = entry_size;
				// do not touch 'entry_to_zap->data', it is bogus; the entry only tells the reader to wrap to the start
			}

			// Write data at the beginning of the queue
			data_queue_write_new_entry(queue, 0 /*off*/, event);
			new_tail = /*off==0 + */ entry_size;
		} else {
			// There is neither enough space after 'tail' nor before 'head', bail
			WPRINTF("no more space is left in " TRANSPORT_FMT ", head=%u, tail=%u, entry_size=%u, written=%llu", TRANSPORT_PRINT(transport), head, tail, entry_size, transport->bytes_written);
			return false;
		}
	} else {
		// Catching up to the 'head' from the other side.
		//     tail           head
		//     V              V
		// ****|--------------|***************
		uint32_t filled_size = (queue_size - head) + tail;
		transport->insert_filled_size = filled_size;
		if (filled_size > transport->insert_filled_size_max)
			transport->insert_filled_size_max = filled_size;

		// Insert can still be done if 'head' will not be overrun
		if ((head - tail) > entry_size) {
			//     tail         head
			//     V            V
			// ****|+++++|------|***************
			//           ^
			//           new_tail
			data_queue_write_new_entry(queue, tail, event);
			new_tail = tail + entry_size;
		} else {
			// There is not enough space without overrunning 'head', bail
			WPRINTF("no more space is left in " TRANSPORT_FMT ", head=%u, tail=%u, entry_size=%u, written=%llu", TRANSPORT_PRINT(transport), head, tail, entry_size, transport->bytes_written);
			return false;
		}
	}

	// Expose all the content written in this thread as per 'release' semantics.
	// The reader must do 'smp_load_acquire' on the same variable ('tail') to see the 'entries' written.
	// !!! This logic does NOT enforce 'tail' in 'reader' to be equal to 'tail' in 'writer'
	new_tail = ROUND_UP(new_tail, sizeof(uint32_t));
	smp_store_release(&queue->tail, new_tail);

	// The new tail was published to the 'queue' but is it necessary to notify the 'reader'?
	// If in the beginning 'tail == head', it means that userspace has finished reading all the
	// content and is going to wait or is already waiting for the 'event'.
	// In such a case it is clear that we must notify the 'reader' no matter what.
	// Moreover 'reader' cannot move the 'head' past the 'tail' so it is guaranteed that it is
	// indeed the latest published 'head' by the reader.

	if (tail != head) {
		// If 'tail != head', it is not as clear. If it so happened that userspace moved 'head'
		// to be equal to 'tail' while the 'writer' was adding the new entry, the 'reader' will go to 'wait'.
		// So we must refresh 'head' to make sure we really do not need to wake up the 'reader'.
		// The opposite situation is also valid - the 'reader' updates 'head' with plain/atomic ops,
		// so its latest value may not be visible to us yet.
		// We need to make sure userspace will continue consuming events now that we wrote the 'tail'.
		// Whenever userspace detects that its current 'tail == head', it performs 'smp_mb'
		// and re-fetches the 'tail' we just wrote to check whether there is anything left to consume.
		smp_mb();
		head = READ_ONCE(queue->head);
	}

	if (tail == head) {
		// atomic_ops.rst: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS!
		atomic_set(&transport->queue_event, 1);

		// The data queue was empty, wakeup the 'reader' which is waiting for us.
		// Use 'smp_wmb' to make sure the 'tail' we stored will be seen by the user.
		// It is also fine if we did 'smp_mb' before; it will pair with the reader's 'smp_rmb' just fine.
		// Also using 'smp_wmb' to ensure 'atomic_set' did set the 'queue_event'.
		smp_wmb();

		if (deferred_wakeup) {
			*deferred_wakeup = transport_event_ref(transport->event);
		} else {
			wake_up_interruptible(&transport->event->msg_wait_queue);
		}
		TRANSPORT_PRINTF("woken up listener ht=%u nt=%u", tail, new_tail);
	}

	return true;
}
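
/*
    For reference, a rough sketch of what the userspace 'reader' loop is
    expected to look like, reconstructed from the comments above. The real
    consumer lives in userspace and is not part of this module; 'consume()'
    and the variable names are illustrative only:

        uint32_t head = queue->head;                     // only the reader writes 'head'
        uint32_t tail = smp_load_acquire(&queue->tail);  // pairs with the writer's release
        while (head != tail) {
            data_queue_entry_t *e = DATA_QUEUE_ENTRY_AT(queue, head);
            if (e->size > queue_size - head) {
                head = 0;                                // zapped entry: wrap to the start
                continue;
            }
            consume(e);
            head = ROUND_UP(head + e->size, sizeof(uint32_t));
        }
        smp_store_release(&queue->head, head);           // publish progress to the writer
*/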

static long transport_queue_events_available(transport_t *transport)
{
	uint32_t tail, head;
	int ev;
	int shutdown;
	shared_data_queue_t *queue = READ_ONCE(transport->queue);

	smp_rmb();
	ev = atomic_xchg(&transport->queue_event, 0);
	if (ev) {
		TRANSPORT_PRINTF("check ev active");
		return 1;
	}

	// This should not be necessary but doing it just in case.
	tail = READ_ONCE(queue->tail);
	head = READ_ONCE(queue->head);
	shutdown = READ_ONCE(transport->shutdown);
	TRANSPORT_PRINTF("check s=%u h=%u t=%u", shutdown, head, tail);

	return shutdown || (head != tail);
}

// This function is called whenever the userspace 'reader' has decided that there are no more events to read.
// It waits on 'msg_wait_queue' for the data queue to gain new content.
// The 'writer' issues 'wake_up_interruptible' when it detects that the queue was empty before its insert.
static long transport_data_queue_wait(transport_t *transport)
{
	shared_data_queue_t *queue = READ_ONCE(transport->queue);
	long ret;

	if (!queue) {
		EPRINTF("queue is NULL");
		return -EINVAL;
	}

	if (wait_event_interruptible_exclusive(transport->event->msg_wait_queue, transport_queue_events_available(transport))) {
		ret = -EINTR;
	} else {
		if (READ_ONCE(transport->shutdown)) {
			ret = -EIO;
		} else {
			ret = 0;
		}
	}

	return ret;
}

static int transport_data_queue_mmap(transport_t *transport, struct vm_area_struct *vma)
{
	unsigned long sz = vma->vm_end - vma->vm_start;
	void *ptr;

	// Technically userspace could mmap only part of the buffer, but that is
	// absolutely going to be a BUG later on when the code manages the
	// shared data queue, so instead let's complain immediately.
	if (0 != vma->vm_pgoff) {
		EPRINTF("mmaped offset is not zero");
		return -EINVAL;
	}

	if (sz != transport->queue_size + DATA_QUEUE_HEADER_SIZE) {
		EPRINTF("mmaped size is invalid, 0x%lx != 0x%x", sz, transport->queue_size);
		return -EINVAL;
	}

	ptr = READ_ONCE(transport->queue);
	if (!ptr) {
		EPRINTF("queue is NULL");
		return -EINVAL;
	}

	return remap_vmalloc_range(vma, transport->queue, 0);
}
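
/*
    The matching userspace call is expected to map the whole region (header
    plus queue body) at offset 0, mirroring the checks above. Illustrative
    only, assuming 'fd' is the opened transport device:

        void *p = mmap(NULL, queue_size + DATA_QUEUE_HEADER_SIZE,
                       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
*/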

static long data_queue_create(const data_queue_params_t *params, shared_data_queue_t **pqueue)
{
	shared_data_queue_t *queue;
	uint32_t size = params->size;

	// Really 'DATA_QUEUE_HEADER_SIZE' is a bare minimum, but
	// let's require a slightly larger size so that at least a few messages fit
	if (size <= TRANSPORT_MINIMAL_SHARED_DATA_QUEUE_SIZE) {
		EPRINTF("size provided is too small");
		return -EINVAL;
	}

	// check if size is PAGE_SIZE aligned because it will later be used in 'mmap'
	if (size & (PAGE_SIZE - 1)) {
		EPRINTF("size is not PAGE_SIZE aligned");
		return -EINVAL;
	}

	queue = (shared_data_queue_t*) vmalloc_user(size);
	if (!queue)
		return -ENOMEM;

	*pqueue = queue;
	return 0;
}
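
/*
    Example of acceptable parameters (illustrative): 'params->size' must be
    PAGE_SIZE aligned and strictly greater than
    TRANSPORT_MINIMAL_SHARED_DATA_QUEUE_SIZE, e.g. a 1 MiB queue:

        data_queue_params_t params = { .size = 1024 * 1024 };
*/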

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

static bool transport_send_msg_nowait(transport_t *transport, msg_t *msg, transport_event_t **deferred_wakeup)
{
	bool need_wakeup = false;
	const SiEvent *event = &msg->event;

	spin_lock(&transport->msg_spinlock);
	{
		if (READ_ONCE(transport->shutdown)) {
			spin_unlock(&transport->msg_spinlock);
			return false;
		}

		if (FP_SI_CT_WANT_REPLY == event->CallbackType) {
			unsigned item_index;
			if (set_is_full(&transport->sent_msgs_set)) {
				WPRINTF("'sent_msgs_set' overflow for " TRANSPORT_FMT " (capacity=%u)", TRANSPORT_PRINT(transport), set_fetch_capacity(&transport->sent_msgs_set));
				spin_unlock(&transport->msg_spinlock);
				transport_shutdown(transport);
				return false;
			}
			item_index = set_items_count(&transport->sent_msgs_set);
			/*
			    'msg ref/unref' for messages stored in 'sent_msgs_set' are invoked in
			    'msg_reply_wait_count inc/dec'.
			    There is no need for separate 'msg ref/unref' calls.
			*/
			*(msg_t **) set_item_ptr(&transport->sent_msgs_set, item_index) = msg_reply_wait_count_inc(msg);
			set_items_count_set(&transport->sent_msgs_set, item_index + 1);
		}

		if (transport->queue) {
			need_wakeup = false;
			if (!transport_shared_data_queue_enqueue_impl(transport, event, deferred_wakeup)) {
				WPRINTF("mmaped queue overflow for " TRANSPORT_FMT, TRANSPORT_PRINT(transport));
				spin_unlock(&transport->msg_spinlock);
				transport_shutdown(transport);
				return false;
			}
		} else {
			need_wakeup = true;
			if (ring_is_full(&transport->msg_ring)) {
				WPRINTF("message queue overflow for " TRANSPORT_FMT " (capacity=%u)", TRANSPORT_PRINT(transport), ring_capacity(&transport->msg_ring));
				spin_unlock(&transport->msg_spinlock);
				transport_shutdown(transport);
				return false;
			}

			*(msg_t **) ring_producer_ptr(&transport->msg_ring) = msg_ref(msg);
			ring_producer_index_move_one(&transport->msg_ring);
		}
	}
	spin_unlock(&transport->msg_spinlock);

	if (need_wakeup) {
		// wakeup userspace reader
		if (deferred_wakeup)
			*deferred_wakeup = transport_event_ref(transport->event);
		else
			wake_up_interruptible(&transport->event->msg_wait_queue);
	}

	return true;
}

static bool transport_send_hello_nowait(transport_t *transport)
{
	msg_t *msg = hello_msg_new();
	bool success;
	if (!msg) {
		success = false;
	} else {
		success = transport_send_msg_nowait(transport, msg, NULL /*deferred_wakeup*/);
		msg_unref(msg);
	}
	return success;
}

static bool should_send(transport_t* transport, msg_t* msg)
{
	transport_id_t transport_id = transport->transport_id;
	bool ret = true;
	int idx = transport_id_index(transport_id);
	task_info_t* task_info = msg->task_info;
	const SiEvent* event = &msg->event;
	uint16_t operation = event->Operation;

	if (transport->control_tgid == current->tgid) {
		return false;
	}

	// Check if transport needs an event
	if (!(READ_ONCE(transport->events_mask) & MSG_TYPE_TO_EVENT_MASK(operation))) {
		return false;
	}

	if (msg->subtype_mask) {
		if (!(READ_ONCE(transport->events_subtype_inclusion_mask) & msg->subtype_mask)) {
			return false;
		}

		if (READ_ONCE(transport->events_subtype_exclusion_mask) & msg->subtype_mask) {
			return false;
		}
	}

	// Check if current transport does not need the event
	if (msg->file_context_msg_info.skipped_transport_ids[idx] == transport_id) {
		return false;
	}

	// Check the task info; the check depends on the type of the SiEvent.
	// task_info might not be available; in that case no further filtering is done.
	if (!task_info) {
		return true;
	}

	if (FP_SI_OT_NOTIFY_PROCESS_EXEC == operation)
	{
		uint64_t pid_version = msg->exec.pid_version;
		ret = task_info_wants_exec_event(task_info, transport_id, pid_version);
	}
	else if (FP_SI_OT_NOTIFY_PROCESS_FORK == operation)
	{
		uint64_t pid_version = msg->fork.pid_version;
		// FORK when task_info is provided is basically an EXEC event so update the pid_version
		(void) task_info_wants_exec_event(task_info, transport_id, pid_version);
		// for FORK event, always send
	}
	else
	{
		uint64_t listening_mask = ~0ULL;
		if (READ_ONCE(task_info->contexts[idx].transport_id) == transport_id)
		{
			listening_mask = READ_ONCE(task_info->contexts[idx].data.listening_mask);
		}

		ret = !!(listening_mask & MSG_TYPE_TO_EVENT_MASK(operation));
	}

	return ret;
}

static bool send_msg_nowait(msg_t *msg)
{
	bool sync = FP_SI_CT_WANT_REPLY == msg->event.CallbackType;
	int i;
	bool sent = false;
	transport_event_t *deferred_wakeups[MAX_TRANSPORT_SIZE];
	int deferred_wakeups_count = 0;

	rcu_read_lock();
	for (i = 0; i < MAX_TRANSPORT_SIZE; i++) {
		transport_t *transport = rcu_dereference(transport_global.transports.transports[i]);
		if (transport && should_send(transport, msg)) {
			deferred_wakeups[deferred_wakeups_count] = NULL;
			sent |= transport_send_msg_nowait(transport, msg, &deferred_wakeups[deferred_wakeups_count]);
			if (deferred_wakeups[deferred_wakeups_count])
			{
				deferred_wakeups_count++;
			}
		}
	}
	rcu_read_unlock();

	for (i = 0; i < deferred_wakeups_count; i++) {
		// Wake up all deferred waiters; wake the last one synchronously to switch to it immediately
		if (sync && (i == deferred_wakeups_count - 1)) {
			wake_up_interruptible_sync(&deferred_wakeups[i]->msg_wait_queue);
		} else {
			wake_up_interruptible(&deferred_wakeups[i]->msg_wait_queue);
		}

		transport_event_unref(deferred_wakeups[i]);
	}

	return sent;
}

static transport_t *transport_new(void)
{
	transport_t *transport = mem_alloc0(sizeof(transport_t));
	if (transport) {
		int err;

		transport->transport_id = transport_acquire_id();
		transport->bytes_written = 0;
		transport->insert_filled_size = 0;
		transport->insert_filled_size_max = 0;
		transport->client_type = CLIENT_UNKNOWN;
		if (0 == transport->transport_id)
		{
			EPRINTF("transport %p failed to acquire transport id", transport);
			mem_free(transport);
			return NULL;
		}

		transport->event = transport_event_new();
		if (!transport->event)
		{
			EPRINTF("transport %p failed to allocate event", transport);
			mem_free(transport);
			return NULL;
		}

		acquire_file_context_entry(transport->transport_id);
		// remember the client process doing 'open' so that its own events are auto-ignored
		WRITE_ONCE(transport_global.transport_ids.ids[transport_id_index(transport->transport_id)], transport->transport_id);
		transport->control_tgid = current->tgid;

		spin_lock_init(&transport->msg_spinlock);
		WRITE_ONCE(transport->events_mask, MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_HELLO)
		                                 | MSG_TYPE_TO_EVENT_MASK(FP_SI_OT_PONG));
		WRITE_ONCE(transport->events_subtype_inclusion_mask, ~0ULL);
		WRITE_ONCE(transport->events_subtype_exclusion_mask, 0);
		WRITE_ONCE(transport->shutdown, false);
		transport->queue = NULL;
		atomic_set(&transport->queue_event, 0);

		if (transport_ring_init(&transport->msg_ring)
		&& transport_set_init(&transport->sent_msgs_set)
		&& transport_send_hello_nowait(transport)) {
			transport_global_register(transport);
		} else {
			transport_disable(transport);
			transport_free(transport);
			return NULL;
		}

		safe_kobject_init(&transport->skobj);
		err = kobject_init_and_add(&transport->skobj.kobj, &s_transport_ktype, &g_memory_metrics->skobj.kobj, "transport%lld", transport->transport_id);
		if (err) {
			transport_disable(transport);
			kobject_put(&transport->skobj.kobj);
			transport_free(transport);
			return NULL;
		}

		err = sysfs_create_files(&transport->skobj.kobj, s_transport_attrs);
		if (err) {
			transport_disable(transport);
			safe_kobject_del(&transport->skobj);
			transport_free(transport);
			return NULL;
		}
	}
	DPRINTF("transport=%p", transport);
	atomic64_add(1, &g_memory_metrics->total_transports);
	return transport;
}

int __init transport_mod_init(void)
{
	int ret;

	transport_global_init();

	ret = device_mod_init();
	if (ret) {
		EPRINTF("'device_mod_init()' failure %i", ret);
	}
	return ret;
}

void transport_mod_down(void)
{
	DPRINTF("");
	device_mod_down();
	DPRINTF("");
}

static msg_t *transport_lookup_msg_ref(transport_t *transport, msg_id_t reply_id) {
	msg_t* msg = NULL;
	DPRINTF("");
	// TODO DK: Is it possible to use a radix tree here instead?
	spin_lock(&transport->msg_spinlock);
	{
		void *item_ptr = set_begin_ptr(&transport->sent_msgs_set);
		void *end_ptr = set_end_ptr(&transport->sent_msgs_set);
		while (item_ptr < end_ptr) {
			msg_t *query = *(msg_t **) item_ptr;
			if (query->id == reply_id) {
				msg = query;
				msg_ref(msg);
				goto unlock;
			}
			item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr);
		}
	}
unlock:
	spin_unlock(&transport->msg_spinlock);
	DPRINTF("ret=%p", msg);
	return msg;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

static long install_file(struct file *file, int fd, int *pfd)
{
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	// file is consumed so no need to call 'fput'
	fd_install(fd, file);
	*pfd = fd;

	return 0;
}

static long open_file_with_flags(const char *full_path, int uflags, int mode, int *pfd)
{
	struct file *file;
	int flags;
	int fd;

	DPRINTF("Opening file '%s', uflags '%d', mode '%d'", full_path, uflags, mode);
	fd = get_unused_fd_compat();
	if (fd < 0) {
		return fd;
	}

	flags = uflags
#ifdef O_LARGEFILE
	      | O_LARGEFILE
#endif
#ifdef O_NOATIME
	      | O_NOATIME
#endif
// 'FMODE_NONOTIFY' refers to 'fanotify' not scanning the file.
#ifdef FMODE_NONOTIFY
	      | FMODE_NONOTIFY
#endif
	;

	file = filp_open(full_path, flags, mode);
	return install_file(file, fd, pfd);
}

static inline long open_file(struct path *path, int *pfd)
{
	struct file *file;
	int flags;
	int fd;

	if (!path->dentry && !path->mnt) {
		return -ENOENT;
	}

	fd = get_unused_fd_compat();
	if (fd < 0) {
		return fd;
	}

	flags = O_RDONLY
#ifdef O_LARGEFILE
	      | O_LARGEFILE
#endif
#ifdef O_NOATIME
	      | O_NOATIME
#endif
// 'FMODE_NONOTIFY' refers to 'fanotify' not scanning the file.
#ifdef FMODE_NONOTIFY
	      | FMODE_NONOTIFY
#endif
	;

	file = dentry_open_compat(path, flags);
	if (IS_ERR(file)) {
		// If the open failed, let's try to open via the path.
		// Notice that this open happens inside the 'client' service context,
		// so this 'filp_open' has a good chance of failing because the
		// 'mount namespaces' of the two processes might differ.
		// Perhaps a proper solution would be opening the file inside the original
		// process context, but that would result in having to create 'file'
		// early or to do extra context switches to the scanned process.
		// Either way seems inefficient so it is currently avoided.
		size_t size = PAGE_SIZE;
		char *buf = __getname();
		const char *full_path;
		if (!buf) {
			return -ENOMEM;
		}

		full_path = d_path(path, buf, size);
		if (!IS_ERR(full_path)) {
			file = filp_open(full_path, flags, 0);
		}
		__putname(buf);
	}

	return install_file(file, fd, pfd);
}

static long transport_ioctl_handle_open_file_from_msg(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	long ret;
	msg_t* msg;
	request_msg_img_t *msg_img = MSG_IMG(query_msg);
	open_file_from_msg_img_t *img = IMG_PAYLOAD(msg_img);

	if (MSG_SIZE(query_msg) < sizeof(request_msg_img_t) + sizeof(open_file_from_msg_img_t)) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		return -EINVAL;
	}

	msg = transport_lookup_msg_ref(transport, MSG_ID(query_msg));
	if (!msg) {
		ret = -ESRCH;
	} else {
		int fd = -1;
		struct path path;

		thread_safe_path_load(img->num == 0 ? &msg->path : &msg->path2, &path);
		msg_unref(msg);
		ret = open_file(&path, &fd);
		path_put(&path);
		if (0 == ret) {
			ret = open_file_return_msg_new(reply_msg, fd);
		}
	}

	return ret;
}

static long transport_ioctl_handle_open_file_by_path(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	long ret;
	char *path;
	size_t pathSize;
	int fd = -1;
	request_msg_img_t *msg_img = MSG_IMG(query_msg);
	open_file_by_path_img_t *img = IMG_PAYLOAD(msg_img);

	(void) transport;
	if (MSG_SIZE(query_msg) <= sizeof(request_msg_img_t) + sizeof(open_file_by_path_img_t)) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		return -EINVAL;
	}

	path = img->path;
	pathSize = MSG_SIZE(query_msg) - (sizeof(request_msg_img_t) + sizeof(open_file_by_path_img_t));
	path[pathSize - 1] = '\0';
	ret = open_file_with_flags(path, img->flags, img->mode, &fd);
	if (0 == ret) {
		ret = open_file_return_msg_new(reply_msg, fd);
	}

	return ret;
}

static long transport_ioctl_handle_get_version(msg_varsized_t *reply_msg)
{
	return version_info_return_msg_new(reply_msg);
}

static long transport_ioctl_handle_data_queue_init(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	request_msg_img_t *msg_img = MSG_IMG(query_msg);
	data_queue_params_t *params = IMG_PAYLOAD(msg_img);
	long err;
	shared_data_queue_t *queue;
	uint32_t queue_size;

	if (MSG_SIZE(query_msg) < sizeof(request_msg_img_t) + sizeof(data_queue_params_t)) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		return -EINVAL;
	}

	err = data_queue_create(params, &queue);
	if (err) {
		return err;
	}

	queue_size = params->size - DATA_QUEUE_HEADER_SIZE;
	{
		spin_lock(&transport->msg_spinlock);
		if (transport->queue) {
			spin_unlock(&transport->msg_spinlock);
			vfree(queue);
			return -EEXIST;
		}

		transport->queue = queue;
		transport->queue_size = queue_size;
		spin_unlock(&transport->msg_spinlock);
	}

	return data_queue_offsets_return_msg_new(reply_msg, queue_size);
}

static int mnt_info_ret_msg_new(msg_varsized_t *msg, bool ok)
{
	size_t msg_img_size;
	request_msg_img_t *msg_img;
	mnt_info_ret_img_t *mnt_info_ret_img;
	msg_sized_t *smsg;

	msg_img_size = sizeof(request_msg_img_t) + sizeof(mnt_info_ret_img_t);

	smsg = msg_varsized_init(msg, msg_img_size);
	if (smsg)
	{
		MSG_TYPE(smsg) = RT_GET_MNT_ID_OFFSET;
	}

	if (!smsg)
	{
		return -ENOMEM;
	}

	msg_img = MSG_IMG(smsg);
	mnt_info_ret_img = IMG_PAYLOAD(msg_img);
	mnt_info_ret_img->ok = ok;

	return 0;
}

#define MNT_ID_FIND_BUFFER_LEN (128)

static inline bool file_ok(struct file *file)
{
	struct dentry *dentry;
	struct inode *inode;
	if (!file)
		return false;

	dentry = file->f_path.dentry;
	if (!dentry)
		return false;

	inode = dentry->d_inode;
	if (!inode)
		return false;

	if (!inode->i_sb)
		return false;

	return true;
}
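
/*
    On kernels where 'struct vfsmount' does not expose 'mnt_id' directly,
    'find_mnt_id_offset()' below probes for it: userspace supplies pairs of
    (fd, expected mntId), and every candidate int-sized offset within
    +/- MNT_ID_FIND_BUFFER_LEN ints around 'path.mnt' is kept only if the
    value at that offset matches the expected mount id for all supplied
    files. Offsets falling inside 'struct vfsmount' itself are cleared up
    front, so the first surviving offset presumably points at 'mnt_id' of the
    enclosing mount structure; it is then cached in 'global_mnt_id_offset'.
*/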

#ifndef VFSMOUNT_HAS_MNT_ID
// return 0 on error, offset on success
static int find_mnt_id_offset(mnt_info_img_t *img)
{
	uint32_t i = 0;
	uintptr_t info_ptr = (uintptr_t)img->data;
	// offset[-MNT_ID_FIND_BUFFER_LEN, -1] => offset_map[0, MNT_ID_FIND_BUFFER_LEN - 1],
	// offset[1, MNT_ID_FIND_BUFFER_LEN]   => offset_map[MNT_ID_FIND_BUFFER_LEN + 1, MNT_ID_FIND_BUFFER_LEN * 2]
	DECLARE_BITMAP(offset_map, MNT_ID_FIND_BUFFER_LEN * 2 + 1);
	bitmap_fill(offset_map, MNT_ID_FIND_BUFFER_LEN * 2 + 1);

	if (img->count <= 0)
	{
		return 0;
	}

	for (i = 0; i < (sizeof(struct vfsmount) / sizeof(int)); i++)
	{
		clear_bit(i + MNT_ID_FIND_BUFFER_LEN, offset_map);
	}

	for (i = 0; i < img->count; i++)
	{
		struct compat_fd fd;
		mnt_info_t *info = (mnt_info_t *)info_ptr;
		info_ptr += (sizeof(mnt_info_t));

		fd = compat_fdget(info->fd);
		if (file_ok(fd_file(fd)))
		{
			struct path path = fd_file(fd)->f_path;
			int *mnt_ptr = (int *)path.mnt;
			int count = 0;
			while (count <= MNT_ID_FIND_BUFFER_LEN)
			{
				int *mnt_id = (int *)(mnt_ptr + count);
				if (*mnt_id != info->mntId)
				{
					clear_bit(count + MNT_ID_FIND_BUFFER_LEN, offset_map);
				}
				mnt_id = (int *)(mnt_ptr - count);
				if (*mnt_id != info->mntId)
				{
					clear_bit(MNT_ID_FIND_BUFFER_LEN - count, offset_map);
				}

				count++;
			}
			compat_fdput(fd);
		}
	}

	for (i = 0; i < MNT_ID_FIND_BUFFER_LEN * 2; i++)
	{
		if (test_bit(i, offset_map))
		{
			return i - MNT_ID_FIND_BUFFER_LEN;
		}
	}

	return 0;
}

int global_mnt_id_offset = 0;
#endif

static long transport_ioctl_handle_mnt_info(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	int ret;
	int offset = 0;
	request_msg_img_t *msg_img = MSG_IMG(query_msg);
	mnt_info_img_t *img = IMG_PAYLOAD(msg_img);
	(void)transport;

	if (MSG_SIZE(query_msg) < sizeof(request_msg_img_t) + sizeof(mnt_info_img_t)) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		return -EINVAL;
	}

#ifdef VFSMOUNT_HAS_MNT_ID
	(void) img; (void) offset;
	ret = mnt_info_ret_msg_new(reply_msg, true);
#else
	offset = find_mnt_id_offset(img);
	IPRINTF("mnt offset in vfsmount: %d", offset);
	if (offset != 0)
	{
		WRITE_ONCE(global_mnt_id_offset, offset);
		ret = mnt_info_ret_msg_new(reply_msg, true);
	}
	else
	{
		ret = mnt_info_ret_msg_new(reply_msg, false);
	}
#endif

	return ret;
}

static long transport_ioctl_process_info(msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	long ret;

	if (query_msg->img_size < (sizeof(request_msg_img_t) + sizeof(get_process_info_img_t))) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		ret = -EINVAL;
	} else {
		request_msg_img_t *msg_img = MSG_IMG(query_msg);
		get_process_info_img_t *img = IMG_PAYLOAD(msg_img);
		pid_t pid = img->pid;
		ret = process_info_return_msg_new(reply_msg, pid);
	}
	DPRINTF("ret=%li", ret);
	return ret;
}

static long transport_ioctl_process_pid_version(msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	long ret;

	if (query_msg->img_size < (sizeof(request_msg_img_t) + sizeof(get_process_info_img_t))) {
		EPRINTF("'%s' message is too short", action_type_to_string(MSG_TYPE(query_msg)));
		ret = -EINVAL;
	} else {
		request_msg_img_t *msg_img = MSG_IMG(query_msg);
		get_process_info_img_t *img = IMG_PAYLOAD(msg_img);
		pid_t pid = img->pid;
		ret = process_pid_version_return_msg_new(reply_msg, pid);
	}
	DPRINTF("ret=%li", ret);
	return ret;
}

static long transport_ioctl_write_read_msg(transport_t *transport, msg_varsized_t *reply_msg, msg_sized_t *query_msg)
{
	long ret;
	action_type_t action_type = 0;

	if (MSG_REPLY(query_msg)) {
		EPRINTF("'reply' ioctl is not supported");
		ret = -EINVAL;
		goto out;
	}

	action_type = MSG_TYPE(query_msg);
	switch (action_type) {
	case AT_OPEN_FILE_FROM_MSG:
		ret = transport_ioctl_handle_open_file_from_msg(transport, reply_msg, query_msg);
		break;

	case AT_OPEN_FILE_BY_PATH:
		ret = transport_ioctl_handle_open_file_by_path(transport, reply_msg, query_msg);
		break;

	case AT_GET_VERSION:
		ret = transport_ioctl_handle_get_version(reply_msg);
		break;

	case AT_INIT_SHARED_DATA_QUEUE:
		ret = transport_ioctl_handle_data_queue_init(transport, reply_msg, query_msg);
		break;

	case AT_GET_MNT_ID_OFFSET:
		ret = transport_ioctl_handle_mnt_info(transport, reply_msg, query_msg);
		break;

	case AT_GET_PROCESS_INFO:
		ret = transport_ioctl_process_info(reply_msg, query_msg);
		break;

	case AT_GET_PROCESS_PID_VERSION:
		ret = transport_ioctl_process_pid_version(reply_msg, query_msg);
		break;

	default:
		EPRINTF("Unexpected '%s' message", action_type_to_string(action_type));
		HEX_DUMP("query_msg: ", MSG_IMG(query_msg), MSG_SIZE(query_msg));
		ret = -EINVAL;
		break;
	}
out:
	DPRINTF("action_type=%d ret=%li", (int) action_type, ret);
	return ret;
}

static long transport_ioctl_copy_from_user(ioctl_hdr_t *ioctl_hdr,
		msg_varsized_t *query_msg, void __user *user_data)
{
	long ret;

	size_t msg_size;
	msg_sized_t *msg;
	request_msg_img_t *msg_img;
	void *payload;

	if (copy_from_user(ioctl_hdr, user_data, sizeof(ioctl_hdr_t))) {
		EPRINTF("'copy_from_user()' failure");
		ret = -EFAULT;
		goto out;
	}
	msg_size = ioctl_hdr->size;
	if (msg_size < sizeof(request_msg_img_t)) {
		EPRINTF("message image is too small");
		ret = -EINVAL;
		goto out;
	}
	if (msg_size > TRANSPORT_MSG_SIZE_MAX) {
		EPRINTF("size > TRANSPORT_MSG_SIZE_MAX");
		ret = -E2BIG;
		goto out;
	}
	msg = msg_varsized_init(query_msg, msg_size);
	if (!msg) {
		ret = -ENOMEM;
		goto out;
	}
	msg_img = MSG_IMG(msg);
	payload = (uint8_t *)user_data + sizeof(ioctl_hdr_t);
	if (copy_from_user(msg_img, payload, msg_size)) {
		msg_varsized_uninit(query_msg);
		EPRINTF("'copy_from_user()' failure");
		ret = -EFAULT;
		goto out;
	}
	ret = 0;
out:
	DPRINTF("ret=%li", ret);
	return ret;
}

static long transport_ioctl_copy_to_user(ioctl_hdr_t *ioctl_hdr,
		msg_sized_t *reply_msg, void __user *user_data)
{
	long ret;

	size_t msg_size = MSG_SIZE(reply_msg);
	size_t capacity;
	void *payload;
	request_msg_img_t *msg_img;

	ioctl_hdr->size = msg_size;
	if (copy_to_user(user_data, ioctl_hdr, sizeof(ioctl_hdr_t))) {
		EPRINTF("'copy_to_user()' failure");
		ret = -EFAULT;
		goto out;
	}
	capacity = ioctl_hdr->capacity;
	if (capacity < msg_size) {
		WPRINTF("capacity=%zu < msg_size=%zu", capacity, msg_size);
		ret = -ENOSPC;
		goto out;
	}
	payload = (uint8_t *)user_data + sizeof(ioctl_hdr_t);
	msg_img = MSG_IMG(reply_msg);
	if (copy_to_user(payload, msg_img, msg_size)) {
		EPRINTF("'copy_to_user()' failure");
		ret = -EFAULT;
		goto out;
	}
	ret = 0;
out:
	DPRINTF("ret=%li", ret);
	return ret;
}
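
/*
    Wire layout of the ioctl buffer implied by the two helpers above
    (summary): the userspace pointer 'arg' starts with an 'ioctl_hdr_t'
    followed immediately by the message image.

        [ ioctl_hdr_t ][ request_msg_img_t + payload ... ]

    On input 'hdr.size' is the size of the request image and 'hdr.capacity'
    is the space available for the reply image; on output 'hdr.size' is
    rewritten with the size of the reply image.
*/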

long transport_device_ioctl(struct file *filp, unsigned int cmd,
		unsigned long arg)
{
	transport_t *transport = filp->private_data;
	long ret;
	if (READ_ONCE(transport->shutdown)) {
		ret = -EIO;
		goto out;
	}
	switch (cmd) {
	case IOCTL_WRITE_AND_READ_MSG:
	case IOCTL_READ_VERSION:
	{
		ioctl_hdr_t ioctl_hdr;
		void *user_data = (void *)arg;
		msg_varsized_t query_msg;
		ret = transport_ioctl_copy_from_user(&ioctl_hdr, &query_msg, user_data);
		if (!ret) {
			msg_varsized_t reply_msg;
			ret = transport_ioctl_write_read_msg(transport, &reply_msg, MSG_VARSIZED_GET_SIZED(&query_msg));
			if (!ret) {
				ret = transport_ioctl_copy_to_user(&ioctl_hdr, MSG_VARSIZED_GET_SIZED(&reply_msg), user_data);
				msg_varsized_uninit(&reply_msg);
			}
			msg_varsized_uninit(&query_msg);
		}
		break;
	}
	default:
		EPRINTF("Unexpected IOCTL cmd=%u", cmd);
		ret = -ENOIOCTLCMD;
	}
out:
	if (-EINVAL == ret) {
		EPRINTF("ioctl failed with EINVAL, dropping the transport");
		transport_shutdown(transport);
	}
	DPRINTF("ret=%li", ret);
	return ret;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

ssize_t transport_device_read(struct file *filp, char __user *user_data,
			      size_t size, loff_t *offset)
{
	msg_t *msg;
	transport_t *transport = filp->private_data;
	size_t img_size;
	ssize_t ret;

	(void) offset;
	if (filp->f_flags & O_NONBLOCK) {
		EPRINTF("'non-blocking' mode is not supported yet");
		ret = -EINVAL;
		transport_shutdown(transport);
		goto out;
	}

	if (!size) {
		EPRINTF("'empty read' is not supported");
		ret = -EINVAL;
		transport_shutdown(transport);
		goto out;
	}

retry_wait:
	// We may start with 'wait*()' because it itself starts
	// with 'condition' check.
	if (wait_event_interruptible_exclusive(transport->event->msg_wait_queue,
			READ_ONCE(transport->shutdown)
			|| !ring_is_empty(&transport->msg_ring))) {
		ret = -EINTR;
		goto out;
	}

	// Lock the state and check if processing is actually possible.
	spin_lock(&transport->msg_spinlock);
	{
		if (READ_ONCE(transport->shutdown)) {
			ret = -EIO;
			spin_unlock(&transport->msg_spinlock);
			goto out;
		}

		if (ring_is_empty(&transport->msg_ring)) {
			WPRINTF("wakeup without messages");
			spin_unlock(&transport->msg_spinlock);
			goto retry_wait;
		}
		msg = *(msg_t **) ring_consumer_ptr(&transport->msg_ring);
		img_size = msg->event.Size;
		DPRINTF("size=%zu img_size=%zu", size, img_size);
		if (size < img_size) {
			ret = -ENOSPC;
			spin_unlock(&transport->msg_spinlock);
			goto out;
		}
		ring_consumer_index_move_one(&transport->msg_ring);
	}
	spin_unlock(&transport->msg_spinlock);

	// 'copy_to_user' MAY sleep (for example in page fault handler)
	if (copy_to_user(user_data, &msg->event, img_size)) {
		WPRINTF("'copy_to_user()' failure");
		ret = -EFAULT;
		transport_shutdown(transport);
	} else {
		ret = img_size;
	}
	msg_unref(msg);
out:
	DPRINTF("ret=%zi", ret);
	return ret;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Forward declaration; the definition is further down in this file
static long wait_msg_killable_timeout(msg_t* msg, unsigned long timeout_jiffies);

static void msg_wait_reply(msg_t *msg)
{
	long ret;

	// We may start with 'wait*()' because it itself starts
	// with 'condition' check.
	DPRINTF("waiting for userspace reply...");
	ret = wait_msg_killable_timeout(msg, msecs_to_jiffies(TRANSPORT_WAIT_REPLY_TIMEOUT_MSECS));
	if (!ret) {
		msg_type_t operation = (msg_type_t) msg->event.Operation;
		// A timeout here means an unexpected issue with userspace.
		FPRINTF("timeout waiting for userspace reply (msg_type=%d/%s)", operation, msg_type_to_string(operation));
		HEX_DUMP("msg: ", &msg->event, msg->event.Size);
		dump_stack();
		// identify and shutdown transport failed to reply
		transport_global_shutdown_msg(msg);
	} else if (ret < 0) {
		// The calling process has been interrupted because SIGKILL was received.
		// In practice this means 'block'.
		DPRINTF("message was interrupted...");
		msg->interrupted = true;
	} else {
		// Userspace reply has been received (msg->reply_msg) or
		// waiting has been explicitly aborted (msg->aborted) for
		// example on userspace disconnect.
		DPRINTF("wait finished (msg->block=%i, wc=%d)", msg->block, atomic_read(&msg->reply_wait_count));
	}
}

void send_msg_async(msg_t *msg)
{
	DPRINTF("msg=%p", msg);
	send_msg_nowait(msg);
	DPRINTF("");
}

void send_msg_async_unref_unchecked(msg_t *msg)
{
	send_msg_async(msg);
	msg_unref(msg);
}

static void msg_mark_sync(msg_t *msg)
{
	msg->event.CallbackType = FP_SI_CT_WANT_REPLY;
}

static bool send_msg_sync_nowait(msg_t *msg)
{
	bool sent;
	DPRINTF("msg=%p", msg);
	msg_mark_sync(msg);
	sent = send_msg_nowait(msg);
	DPRINTF("msg=%p sent=%i", msg, sent);
	return sent;
}

void send_msg_sync(msg_t *msg)
{
	DPRINTF("msg=%p", msg);
	if (send_msg_sync_nowait(msg)) {
		msg_wait_reply(msg);
	}
	DPRINTF("");
}

void send_msg_sync_unref_unchecked(msg_t *msg)
{
	send_msg_sync(msg);
	thread_safe_path_clear(&msg->path);
	// TODO: Why is path2 not cleared?
	msg_unref(msg);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

static int transport_handle_ping_msg(transport_t *transport, msg_sized_t *ping)
{
	int ret;
	msg_t *msg;
	bool sync;
	uint64_t event_uid;
	transport_event_t *event = NULL;

	if (ping->img_size < (sizeof(request_msg_img_t) + sizeof(ping_img_t))) {
		DPRINTF("'ping' message is too short. ignoring it.");
		ret = -EINVAL;
		goto out;
	}

	event_uid = transport_global_sequence_next();
	msg = pong_msg_new(ping, event_uid);
	if (!msg) {
		ret = -ENOMEM;
		goto out;
	}

	// reflect ping's 'reply' policy
	sync = !!MSG_ID(ping);
	if (sync) {
		msg->event.CallbackType = FP_SI_CT_WANT_REPLY;
	}

	transport_send_msg_nowait(transport, msg, &event);
	if (event) {
		wake_up_interruptible_sync(&event->msg_wait_queue);
		transport_event_unref(event);
	}
	msg_wait_reply(msg);

	ret = 0;
	msg_unref(msg);

out:
	return ret;
}

static int transport_handle_set_listening_mask_process(transport_t *transport, msg_sized_t *msg)
{
	request_msg_img_t *msg_img;
	process_set_listening_mask_img_t *img;
	pid_t pid;
	uint64_t events_mask;
	int ret;
	task_info_t *info;
	uint64_t unique_pid;
	uint64_t pid_version;

	if (msg->img_size < (sizeof(request_msg_img_t) + sizeof(process_set_listening_mask_img_t))) {
		DPRINTF("'pid' message is too short. ignoring it.");
		ret = -EINVAL;
		goto out;
	}

	msg_img = MSG_IMG(msg);
	img = IMG_PAYLOAD(msg_img);
	pid = img->pid;
	events_mask = img->events_mask;
	unique_pid = img->unique_pid;
	pid_version = img->pid_version;
	// If the user gave us a 'pid_version', do not attempt any 'weird' gets; just fetch the current info from the map.
	if (0 == pid_version)
		info = task_info_map_get_by_pid(pid, unique_pid);
	else
		info = task_info_lookup(pid, unique_pid);

	if (info) {
		ret = task_info_set_listening_mask(info, transport->transport_id, events_mask, pid_version);
		task_info_put(info);
	} else {
		ret = -ESRCH;
	}

out:
	DPRINTF("ret=%i", ret);
	return ret;
}

// This msg is received when userspace is processing an event msg
static int transport_handle_file_context_add_msg(transport_t *transport, msg_sized_t *add_msg)
{
	request_msg_img_t *add_msg_img;
	msg_t *msg;
	int ret = 0;

	if (add_msg->img_size < (sizeof(request_msg_img_t)))
	{
		DPRINTF("message is too short. ignoring it.");
		ret = -EINVAL;
		goto out;
	}

	add_msg_img = MSG_IMG(add_msg);
	msg = transport_lookup_msg_ref(transport, add_msg_img->id);
	if (!msg)
	{
		ret = -ESRCH;
	}
	else
	{
		// At this moment, the event msg that was sent synchronously still exists,
		// so we can utilize its information here.
		if (msg->file_context_msg_info.key.file_key.ptr != 0)
		{
			int flags = 0;
			file_context_info_t info = {0};
			uint16_t operation = msg->event.Operation;
			if (FP_SI_OT_SYNC_FILE_PRE_OPEN == operation || FP_SI_OT_SYNC_FILE_PRE_WRITE == operation)
			{
				info.pid_version = msg->open.pid_version;
				flags = msg->open.flags;
				info.low = msg->write.low;
				info.high = msg->write.high;
			}
			else
			{
				EPRINTF("%s: unsupported msg type", __func__);
				msg_unref(msg);
				return -EFAULT;
			}
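			// Adding 1 converts the O_ACCMODE access bits (0=O_RDONLY, 1=O_WRONLY, 2=O_RDWR)
			// into FMODE_READ/FMODE_WRITE style bits in place; this appears to mirror the
			// kernel's OPEN_FMODE() trick. No carry into higher flag bits can occur because
			// the access-mode value is at most O_RDWR.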
			if ((flags & O_ACCMODE) <= O_RDWR)
			{
				flags += 1;
			}

			info.msg_info.key = msg->file_context_msg_info.key;
			if (FP_SI_OT_SYNC_FILE_PRE_OPEN == operation)
			{
				file_context_open_file_t *file_node = NULL;
				file_context_open_process_t *process_node = NULL;
				add_open_cache(transport->transport_id, &info, &file_node, &process_node);
				if (process_node)
				{
					atomic_or_compat(flags, &process_node->flags);
				}
				put_open_cache(file_node, process_node);
			}
			else if (FP_SI_OT_SYNC_FILE_PRE_WRITE == operation)
			{
				file_context_rw_t *node = NULL;
				node = add_rw_cache(transport->transport_id, &info, FILE_CONTEXT_WRITE_TABLE);
				if (node)
				{
					put_rw_cache(node);
				}
			}
		}
		msg_unref(msg);
	}
out:
	DPRINTF("ret=%i", ret);
	return ret;
}

static int transport_handle_set_listening_mask_global(transport_t *transport, msg_sized_t *msg)
{
	request_msg_img_t *msg_img;
	events_mask_img_t *img;
	uint64_t mask;
	int ret;

	if (msg->img_size < (sizeof(request_msg_img_t) + sizeof(events_mask_img_t))) {
		DPRINTF("'events' message is too short. ignoring it.");
		ret = -EINVAL;
		goto out;
	}

	msg_img = MSG_IMG(msg);
	img = IMG_PAYLOAD(msg_img);
	mask = img->events_mask;

	spin_lock(&transport->msg_spinlock);
	{
		if (READ_ONCE(transport->shutdown)) {
			// Do not allow changing the mask after shutdown:
			// the transport will not be able to receive any events.
			ret = -EFAULT;
		} else {
			WRITE_ONCE(transport->events_mask, mask);
			ret = 0;
		}
	}
	spin_unlock(&transport->msg_spinlock);

	transport_global_recalculate_combined_all_event_masks();

out:
	DPRINTF("ret=%i", ret);
	return ret;
}

static int transport_handle_set_listening_subtype_mask(transport_t *transport, uint64_t* target, msg_sized_t *msg)
{
	request_msg_img_t *msg_img;
	events_mask_img_t *img;
	uint64_t mask;
	int ret;

	if (msg->img_size < (sizeof(request_msg_img_t) + sizeof(events_mask_img_t))) {
		DPRINTF("'events' message is too short. ignoring it.");
		ret = -EINVAL;
		goto out;
	}

	msg_img = MSG_IMG(msg);
	img = IMG_PAYLOAD(msg_img);
	mask = img->events_mask;

	spin_lock(&transport->msg_spinlock);
	{
		if (READ_ONCE(transport->shutdown)) {
			// Do not allow changing the mask after shutdown:
			// the transport will not be able to receive any events.
			ret = -EFAULT;
		} else {
			WRITE_ONCE(*target, mask);
			ret = 0;
		}
	}
	spin_unlock(&transport->msg_spinlock);

	transport_global_recalculate_combined_all_event_masks();

out:
	DPRINTF("ret=%i", ret);
	return ret;
}

static int transport_handle_set_client_type(transport_t *transport, msg_sized_t *msg)
{
	request_msg_img_t *msg_img;
	transport_client_type_img_t *img;
	int ret = 0;
	char* name;

	if (msg->img_size < (sizeof(request_msg_img_t) + sizeof(transport_client_type_img_t))) {
		ret = -EINVAL;
		goto out;
	}

	msg_img = MSG_IMG(msg);
	img = IMG_PAYLOAD(msg_img);
	transport->client_type = img->client_type;
	name = (char*) mem_alloc(256);
	if (name) {
		snprintf(name, 256, "transport%llu_%s", transport->transport_id, transport_name(transport));
		ret = kobject_rename(&transport->skobj.kobj, name);
		mem_free(name);
	}

out:
	DPRINTF("ret=%i", ret);
	return ret;
}

// FIXME: do something with 'reply'. For example, merge several replies
// into one; link replies into a list; extract 'responses' and merge them.
static void handle_reply(msg_t *query_msg, msg_sized_t *reply_msg)
{
	// handle 'long' 'reply'
	size_t headers_size = sizeof(request_msg_img_t) + sizeof(reply_img_t);
	// Note: for compatibility with the legacy short 'reply_img_t', the default 'reply_type' is RT_ALLOW
	if (MSG_SIZE(reply_msg) >= headers_size) {
		request_msg_img_t *reply_msg_img = MSG_IMG(reply_msg);
		reply_img_t *reply_img = IMG_PAYLOAD(reply_msg_img);
		reply_type_t reply_type = reply_img->type;
		DPRINTF("MSG_SIZE(reply_msg)=%zu - headers_size=%zu = %zu reply_type=%u",
				MSG_SIZE(reply_msg), headers_size,
				MSG_SIZE(reply_msg) - headers_size, reply_type);
		if (RT_BLOCK == reply_type) {
			query_msg->block = true;
		}
	}
}

static int transport_handle_reply(transport_t *transport, msg_sized_t *reply)
{
	msg_id_t reply_id = MSG_ID(reply);
	msg_type_t reply_type = MSG_TYPE(reply);
	msg_t* msg = NULL;

	DPRINTF("%lu %d", reply_id, reply_type);
	// find 'query' matching this 'reply'
	spin_lock(&transport->msg_spinlock);
	{
		void *item_ptr = set_begin_ptr(&transport->sent_msgs_set);
		void *end_ptr = set_end_ptr(&transport->sent_msgs_set);
		while (item_ptr < end_ptr) {
			msg_t *query = *(msg_t **) item_ptr;
			if (query->id == reply_id) {
				// remove 'query' from 'set'
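				// (swap-remove: move the last item into this slot and shrink the set by one)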
				*(msg_t **) item_ptr = *(msg_t **) set_item_ptr(
						&transport->sent_msgs_set,
						set_items_count_dec(&transport->sent_msgs_set));
				msg = query;
				goto unlock;
			}
			item_ptr = set_ptr_next(&transport->sent_msgs_set, item_ptr);
		}
		WPRINTF("Unexpected 'reply' with type=%i id=%llX", reply_type, reply_id);
	}
unlock:
	spin_unlock(&transport->msg_spinlock);

	if (msg) {
		handle_reply(msg, reply);
		msg_reply_wait_count_dec(msg);
	}

	return msg ? 0 : -ESRCH;
}

static int transport_handle_msg(transport_t *transport, msg_sized_t *msg)
{
	int ret;

	if (msg->img_size < sizeof(request_msg_img_t)) {
		DPRINTF("message image is too small");
		ret = -EINVAL;
		goto out;
	}

	if (MSG_REPLY(msg)) {
		ret = transport_handle_reply(transport, msg);
	} else { // !reply
		action_type_t type = MSG_TYPE(msg);
		DPRINTF("type=%i", type);
		switch (type) {
		case AT_PING:
			ret = transport_handle_ping_msg(transport, msg);
			break;

		case AT_WAIT_SHARED_DATA_QUEUE:
			ret = transport_data_queue_wait(transport);
			break;

		case AT_FILE_CONTEXT_ADD:
			ret = transport_handle_file_context_add_msg(transport, msg);
			break;

		case AT_SET_LISTENING_MASK_GLOBAL:
			ret = transport_handle_set_listening_mask_global(transport, msg);
			break;

		case AT_SET_LISTENING_MASK_PROCESS:
			ret = transport_handle_set_listening_mask_process(transport, msg);
			break;

		case AT_SET_LISTENING_SUBTYPE_INCLUSION_MASK:
			ret = transport_handle_set_listening_subtype_mask(transport, &transport->events_subtype_inclusion_mask, msg);
			break;

		case AT_SET_LISTENING_SUBTYPE_EXCLUSION_MASK:
			ret = transport_handle_set_listening_subtype_mask(transport, &transport->events_subtype_exclusion_mask, msg);
			break;

		case AT_SET_TRANSPORT_CLIENT_TYPE:
			ret = transport_handle_set_client_type(transport, msg);
			break;

		default:
			WPRINTF("Unexpected message type=%i/%s", type, action_type_to_string(type));
			ret = -EINVAL;
		}
	}

out:
	DPRINTF("ret=%i", ret);
	return ret;
}

ssize_t transport_device_write(struct file *filp, const char __user *user_data,
			       size_t size, loff_t *offset)
{
	transport_t *transport = filp->private_data;
	msg_varsized_t msg;
	msg_sized_t* smsg;
	request_msg_img_t *msg_img;
	ssize_t ret;

	(void) offset;
	if (READ_ONCE(transport->shutdown)) {
		ret = -EIO;
		goto out;
	}

	if (filp->f_flags & O_NONBLOCK) {
		EPRINTF("'non-blocking' mode is not supported yet");
		ret = -EINVAL;
		transport_shutdown(transport);
		goto out;
	}

	if (!size) {
		WPRINTF("'zero write' is not supported for " TRANSPORT_FMT, TRANSPORT_PRINT(transport));
		ret = -EINVAL;
		transport_shutdown(transport);
		goto out;
	}
	if (size > TRANSPORT_MSG_SIZE_MAX) {
		WPRINTF("size > TRANSPORT_MSG_SIZE_MAX");
		ret = -E2BIG;
		goto out;
	}
	smsg = msg_varsized_init(&msg, size);
	if (!smsg) {
		ret = -ENOMEM;
		goto out;
	}
	msg_img = MSG_IMG(smsg);
	if (copy_from_user(msg_img, user_data, size)) {
		EPRINTF("'copy_from_user()' failure");
		ret = -EFAULT;
		transport_shutdown(transport);
		goto free_msg;
	}
	ret = transport_handle_msg(transport, smsg);
	if (ret) {
		// make sure error code is negative
		if (ret > 0) {
			EPRINTF("error code must be negative");
			ret = -ret;
		}
		goto free_msg;
	}
	ret = size;
free_msg:
	msg_varsized_uninit(&msg);
out:
	DPRINTF("ret=%zi", ret);
	return ret;
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

/*
    Warning: 'transport_open()' and 'transport_release()' may be
    simultaneously invoked by several threads or processes.

    Note: We can match different 'transport' instances using the 'device'
    'major'/'minor' from 'inode->i_rdev'. A pointer to the selected 'transport'
    can be stored in 'filp->private_data' for later use in '*_read()',
    '*_write()', etc.

    Note: We may create 'transport' on 'first open' and destroy it on
    'last close'.
*/
/*
    There is a possibility of a 'deadlock' between our 'kernel' and
    'userspace' code while processing events generated by our own userspace
    process, until that process is registered in the 'ignore' list.
*/
int transport_device_open(struct inode *inode, struct file *filp)
{
	bool ok;
	transport_t *transport;
	int ret;
	bool safe_mode;

#ifdef FMODE_NONOTIFY
	// We are using 'fsnotify' ourselves so avoid raising 'fsnotify' events
	filp->f_mode |= FMODE_NONOTIFY;
#endif

	DPRINTF("inode->i_rdev: major=%u minor=%u", imajor(inode), iminor(inode));
	DPRINTF("filp->f_flags=%X", filp->f_flags);
	if (filp->f_flags & O_NONBLOCK) {
		EPRINTF("'non-blocking' mode is not supported yet");
		ret = -EINVAL;
		goto out;
	}

	safe_mode = filp->f_flags & O_NOCTTY;

#ifndef HOOK_LIST_USE_HLIST
	if (safe_mode) {
		EPRINTF("safe mode requires LSM hook list");
		ret = -EINVAL;
		goto out;
	}
#endif

#ifndef CONFIG_SECURITY_PATH
	if (safe_mode) {
		EPRINTF("safe mode requires LSM security path");
		ret = -EINVAL;
		goto out;
	}
#endif

#ifndef FMODE_NONOTIFY
	if (safe_mode) {
		EPRINTF("safe mode requires fanotify FMODE_NONOTIFY flag");
		ret = -EINVAL;
		goto out;
	}
#endif

	mutex_lock(&transport_global.transport_mutex);
	{
		DPRINTF("transport_count=%u", transport_global.transport_count);

		transport = transport_new();
		if (!transport) {
			WPRINTF("'%s()' failure", "transport_new");
			ret = -ENOMEM;
			goto unlock_open_close_mutex;
		}
		filp->private_data = transport;

		if (!transport_global.transport_count) {
			// FIXME: 'attach' may fail
			IPRINTF("attaching interceptors");
			ret = acquire_file_modify_entry();
			if (ret != 0)
			{
				// undo the transport created above, as the other failure paths below do
				transport_disable(transport);
				transport_del(transport);
				filp->private_data = NULL;
				goto unlock_open_close_mutex;
			}
			mod_rundown_protection_set_ready();
			fsnotify_events_listener_init();
			register_ftrace_post_events();
			lsm_hooks_init();
			ret = syscall_hooks_attach(safe_mode);
			if (ret) {
				EPRINTF("'%s()' failure %i", "syscall_hooks_attach", ret);
				lsm_hooks_exit();
				unregister_ftrace_post_events();
				mod_rundown_protection_set_rundown_active();
				ok = mod_rundown_protection_wait_for_rundown_timeout(msecs_to_jiffies(TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS));
				if (!ok) {
					WPRINTF("Failed to wait for module rundown");
				}
				fsnotify_events_listener_deinit();
				transport_disable(transport);
				transport_del(transport);
				filp->private_data = NULL;
				release_file_modify_entry();
				goto unlock_open_close_mutex;
			}
			ret = tracepoints_attach();
			if (ret) {
				EPRINTF("'%s()' failure %i", "tracepoints_attach", ret);
				syscall_hooks_detach();
				lsm_hooks_exit();
				unregister_ftrace_post_events();
				mod_rundown_protection_set_rundown_active();
				ok = mod_rundown_protection_wait_for_rundown_timeout(msecs_to_jiffies(TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS));
				if (!ok) {
					WPRINTF("Failed to wait for module rundown");
				}
				fsnotify_events_listener_deinit();
				transport_disable(transport);
				transport_del(transport);
				filp->private_data = NULL;
				release_file_modify_entry();
				goto unlock_open_close_mutex;
			}
			IPRINTF("interceptors attached");
		}
		++transport_global.transport_count;
		ret = 0;
	}
unlock_open_close_mutex:
	mutex_unlock(&transport_global.transport_mutex);
out:
	DPRINTF("ret=%i", ret);
	if (ret)
		return ret;

#ifdef FMODE_STREAM
	return stream_open(inode, filp);
#else
#ifdef FMODE_ATOMIC_POS
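	// On kernels without stream_open(), clear FMODE_ATOMIC_POS so that concurrent
	// reads and writes on this file are not serialized on the file position lock.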
	filp->f_mode &= ~(FMODE_ATOMIC_POS);
#endif
	return nonseekable_open(inode, filp);
#endif
}

// 'release()' means 'close()'
int transport_device_release(struct inode *inode, struct file *filp)
{
	bool ok;
	transport_t *transport = filp->private_data;

	(void) inode;
	mutex_lock(&transport_global.transport_mutex);
	{
		transport_disable(transport);
		transport_global_recalculate_combined_all_event_masks_impl();

		DPRINTF("transport_count=%u", transport_global.transport_count);
		if (!--transport_global.transport_count) {
			IPRINTF("detaching interceptors");
			tracepoints_detach();
			// FIXME: 'syscall_hooks_detach()' may fail
			syscall_hooks_detach();
			lsm_hooks_exit();
			unregister_ftrace_post_events();
			mod_rundown_protection_set_rundown_active();
			ok = mod_rundown_protection_wait_for_rundown_timeout(msecs_to_jiffies(TRANSPORT_WAIT_RUNDOWN_TIMEOUT_MSECS));
			if (!ok) {
				WPRINTF("Failed to wait for module rundown");
			}
			// It is absolutely crucial to call this after rundown protection!!!
			fsnotify_events_listener_deinit();
			task_info_maps_clear();
			release_file_modify_entry();
			IPRINTF("interceptors detached");
		}
	}
	mutex_unlock(&transport_global.transport_mutex);

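	// Wait for any in-flight RCU read-side critical sections that might still be
	// referencing this transport before it is deleted.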
	synchronize_rcu();
	transport_del(transport);

	return 0;
}

int transport_device_mmap(struct file *filp, struct vm_area_struct *vma)
{
	int ret;
	transport_t *transport = filp->private_data;
	if (READ_ONCE(transport->shutdown)) {
		ret = -EIO;
		goto out;
	}

	ret = transport_data_queue_mmap(transport, vma);

out:
	return ret;
}

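// Sleep until 'msg->reply_wait_count' drops to zero or the timeout expires,
// preferably in a killable sleep (falling back to an uninterruptible one on old kernels).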
static long wait_msg_killable_timeout(msg_t* msg, unsigned long timeout_jiffies)
{
#ifndef HAVE_WAIT_EVENT_KILLABLE_TIMEOUT
	// 'wait_event_interruptible_timeout' is a macro, and so are
	// 'TASK_KILLABLE' and 'TASK_INTERRUPTIBLE'.
	// We need the functionality of 'wait_event_interruptible_timeout',
	// but with 'TASK_INTERRUPTIBLE' replaced by 'TASK_KILLABLE', which
	// is achieved by temporarily redefining 'TASK_INTERRUPTIBLE'
	// (and 'signal_pending').
	// If the trick cannot be applied, fall back to the regular 'wait_event_timeout'.
#if defined(TASK_KILLABLE) && defined(TASK_INTERRUPTIBLE) && defined(wait_event_interruptible_timeout) && !defined(signal_pending)
#undef TASK_INTERRUPTIBLE
#define TASK_INTERRUPTIBLE TASK_KILLABLE
#define signal_pending fatal_signal_pending
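	// With 'TASK_INTERRUPTIBLE' and 'signal_pending' redefined, the macro below
	// sleeps in TASK_KILLABLE and only wakes early on fatal signals.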
	return wait_event_interruptible_timeout(msg->wait_queue,
		!atomic_read(&msg->reply_wait_count),
		timeout_jiffies);
#undef TASK_INTERRUPTIBLE
#undef signal_pending
#else
	// The redefinition trick cannot be applied; fall back to the
	// 'TASK_UNINTERRUPTIBLE' variant. This should not cause any issues
	// as long as the APL daemon keeps responding to events.
	return wait_event_timeout(msg->wait_queue,
		!atomic_read(&msg->reply_wait_count),
		timeout_jiffies);
#endif
#else
	// Just use the well-defined macro provided by the kernel.
	return wait_event_killable_timeout(msg->wait_queue,
		!atomic_read(&msg->reply_wait_count),
		timeout_jiffies);
#endif
}
