/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <assert.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "util.h"

#include "mmap-cache.h"

/* Size in bytes (8 MiB) that mmap_cache_put() widens smaller mapping
 * requests to. */
#define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
/* Lower bound for the number of window slots: mmap_cache_new() allocates
 * MAX(contexts_max, WINDOWS_MAX) of them. */
#define WINDOWS_MAX 32

/* One slot of the window pool, describing a mapped region of a file.
 * Windows are addressed by their index in MMapCache.windows; the index
 * (unsigned) -1 means "none" in all link fields below. */
typedef struct Window {
        /* File the window belongs to; set to -1 when the window is
         * dropped by mmap_cache_close_fd()/mmap_cache_close_fd_range() */
        int fd;
        /* Start of the mapping, or NULL while the slot has no mapping */
        void *ptr;
        /* File offset and length in bytes of the mapped region */
        uint64_t offset;
        uint64_t size;

        /* Number of contexts currently pointing at this window */
        unsigned n_ref;
        /* Links in the list of unreferenced windows (MMapCache.lru_first/lru_last) */
        unsigned lru_prev;
        unsigned lru_next;

        /* Links in the list of windows of the same fd (FileDescriptor.windows) */
        unsigned by_fd_prev;
        unsigned by_fd_next;
} Window;

/* Entry of the MMapCache.by_fd table: one file descriptor known to the
 * cache together with the head of its window list. */
typedef struct FileDescriptor {
        /* The file descriptor; also the sort/search key of the table */
        int fd;
        /* Index of the first window of this fd, or (unsigned) -1 if none */
        unsigned windows;
} FileDescriptor;

/* The cache object: a fixed pool of window slots plus the lookup tables
 * (per context, per fd) and the list of unreferenced windows. */
struct MMapCache {
        /* Reference counter, see mmap_cache_ref()/mmap_cache_unref() */
        unsigned n_ref;

        /* Capacities of by_context[], windows[] and by_fd[] */
        unsigned contexts_max;
        unsigned windows_max;
        unsigned fds_max;

        /* Number of entries of windows[] and by_fd[] currently in use */
        unsigned n_windows;
        unsigned n_fds;

        /* Head and tail of the list of windows with n_ref == 0, or
         * (unsigned) -1 when the list is empty. Windows are appended at
         * the tail and recycled from the head. */
        unsigned lru_first, lru_last;

        Window *windows;
        /* Window index each context currently points at, or (unsigned) -1 */
        unsigned *by_context;
        /* Known file descriptors, kept sorted by fd so that bsearch() works */
        FileDescriptor *by_fd;
};

/* Forward declaration: mmap_cache_allocate_window() calls this before its
 * definition further down in the file. */
static int mmap_cache_peek_fd_index(MMapCache *m, int fd, unsigned *fd_index);

/* Releases the memory mapping behind window w, if it has one. Only ptr
 * is reset to NULL; the slot keeps its fd/offset/size and stays linked
 * in whatever lists it is on. */
static void mmap_cache_window_unmap(MMapCache *m, unsigned w) {
        Window *win;

        assert(m);
        assert(w < m->n_windows);

        win = &m->windows[w];

        if (win->ptr) {
                munmap(win->ptr, win->size);
                win->ptr = NULL;
        }
}

/* Appends the unreferenced window w to the tail of the LRU list. */
static void mmap_cache_window_add_lru(MMapCache *m, unsigned w) {
        Window *win;
        unsigned tail;

        assert(m);
        assert(w < m->n_windows);

        win = &m->windows[w];
        assert(win->n_ref == 0);

        tail = m->lru_last;

        /* Hook the old tail, if there is one, up to the new entry */
        if (tail != (unsigned) -1) {
                assert(m->windows[tail].lru_next == (unsigned) -1);
                m->windows[tail].lru_next = w;
        }

        win->lru_next = (unsigned) -1;
        win->lru_prev = tail;
        m->lru_last = w;

        /* An empty list gets the new entry as its head, too */
        if (m->lru_first == (unsigned) -1)
                m->lru_first = w;
}

/* Unlinks window w from the LRU list, fixing up the neighbours and the
 * list head/tail as needed. w's own link fields are left untouched. */
static void mmap_cache_window_remove_lru(MMapCache *m, unsigned w) {
        Window *win;
        unsigned before, after;

        assert(m);
        assert(w < m->n_windows);

        win = &m->windows[w];
        before = win->lru_prev;
        after = win->lru_next;

        if (before != (unsigned) -1) {
                assert(m->windows[before].lru_next == w);
                m->windows[before].lru_next = after;
        } else {
                /* No predecessor: w must be the head */
                assert(m->lru_first == w);
                m->lru_first = after;
        }

        if (after != (unsigned) -1) {
                assert(m->windows[after].lru_prev == w);
                m->windows[after].lru_prev = before;
        } else {
                /* No successor: w must be the tail */
                assert(m->lru_last == w);
                m->lru_last = before;
        }
}

/* Pushes window w onto the front of the window list of the fd table
 * entry fd_index. The window's fd must match the entry's fd. */
static void mmap_cache_fd_add(MMapCache *m, unsigned fd_index, unsigned w) {
        FileDescriptor *f;
        Window *win;
        unsigned head;

        assert(m);
        assert(fd_index < m->n_fds);

        win = &m->windows[w];
        f = &m->by_fd[fd_index];
        assert(f->fd == win->fd);

        head = f->windows;
        if (head != (unsigned) -1) {
                /* The previous head gets the new window as predecessor */
                assert(m->windows[head].by_fd_prev == (unsigned) -1);
                m->windows[head].by_fd_prev = w;
        }

        win->by_fd_prev = (unsigned) -1;
        win->by_fd_next = head;
        f->windows = w;
}

/* Unlinks window w from the window list of the fd table entry fd_index.
 * w's own link fields are left untouched. */
static void mmap_cache_fd_remove(MMapCache *m, unsigned fd_index, unsigned w) {
        FileDescriptor *f;
        Window *win;
        unsigned before, after;

        assert(m);
        assert(fd_index < m->n_fds);

        win = &m->windows[w];
        f = &m->by_fd[fd_index];
        before = win->by_fd_prev;
        after = win->by_fd_next;

        /* Everything on this list has to belong to the same fd */
        assert(f->fd == win->fd);
        assert(after == (unsigned) -1 || m->windows[after].fd == win->fd);
        assert(before == (unsigned) -1 || m->windows[before].fd == win->fd);

        if (before != (unsigned) -1) {
                assert(m->windows[before].by_fd_next == w);
                m->windows[before].by_fd_next = after;
        } else {
                /* No predecessor: w must be the list head */
                assert(f->windows == w);
                f->windows = after;
        }

        if (after != (unsigned) -1) {
                assert(m->windows[after].by_fd_prev == w);
                m->windows[after].by_fd_prev = before;
        }
}

/* Detaches context c from the window it points at, if any. When this
 * drops the window's last reference the window is put on the LRU list
 * so that it may be recycled. */
static void mmap_cache_context_unset(MMapCache *m, unsigned c) {
        Window *win;
        unsigned idx;

        assert(m);
        assert(c < m->contexts_max);

        idx = m->by_context[c];
        if (idx == (unsigned) -1)
                return;

        m->by_context[c] = (unsigned) -1;

        win = &m->windows[idx];
        assert(win->n_ref > 0);

        if (--win->n_ref == 0)
                mmap_cache_window_add_lru(m, idx);
}

/* Makes context c point at window w, releasing whatever window it
 * pointed at before. A window that gains its first reference is taken
 * off the LRU list. */
static void mmap_cache_context_set(MMapCache *m, unsigned c, unsigned w) {
        Window *win;

        assert(m);
        assert(c < m->contexts_max);
        assert(w < m->n_windows);

        /* Nothing to do if the context already points there */
        if (m->by_context[c] != w) {
                mmap_cache_context_unset(m, c);

                m->by_context[c] = w;

                win = &m->windows[w];
                if (++win->n_ref == 1)
                        mmap_cache_window_remove_lru(m, w);
        }
}

/* Destroys the cache: unmaps every window in use and frees all tables
 * as well as the cache object itself. */
static void mmap_cache_free(MMapCache *m) {
        unsigned i;

        assert(m);

        /* Tear down live mappings before the slot array goes away */
        for (i = 0; m->windows && i < m->n_windows; i++)
                mmap_cache_window_unmap(m, i);

        free(m->windows);
        free(m->by_context);
        free(m->by_fd);
        free(m);
}

/* Allocates a new cache with room for contexts_max contexts and fds_max
 * file descriptors (both must be > 0). The window pool gets
 * MAX(contexts_max, WINDOWS_MAX) slots. The returned object carries one
 * reference. Returns NULL if any allocation fails. */
MMapCache* mmap_cache_new(unsigned contexts_max, unsigned fds_max) {
        MMapCache *cache;
        unsigned c;

        assert(contexts_max > 0);
        assert(fds_max > 0);

        cache = new0(MMapCache, 1);
        if (!cache)
                return NULL;

        cache->n_ref = 1;
        cache->contexts_max = contexts_max;
        cache->fds_max = fds_max;
        cache->windows_max = MAX(cache->contexts_max, WINDOWS_MAX);
        cache->lru_first = cache->lru_last = (unsigned) -1;

        cache->windows = new(Window, cache->windows_max);
        if (!cache->windows)
                goto fail;

        cache->by_context = new(unsigned, cache->contexts_max);
        if (!cache->by_context)
                goto fail;

        /* No context points at a window yet */
        for (c = 0; c < cache->contexts_max; c++)
                cache->by_context[c] = (unsigned) -1;

        cache->by_fd = new(FileDescriptor, cache->fds_max);
        if (!cache->by_fd)
                goto fail;

        return cache;

fail:
        mmap_cache_free(cache);
        return NULL;
}

/* Takes an additional reference on the cache and returns it. */
MMapCache* mmap_cache_ref(MMapCache *m) {
        assert(m);
        assert(m->n_ref > 0);

        m->n_ref += 1;

        return m;
}

/* Drops one reference on the cache and destroys it when that was the
 * last one. Always returns NULL. */
MMapCache* mmap_cache_unref(MMapCache *m) {
        assert(m);
        assert(m->n_ref > 0);

        if (m->n_ref > 1) {
                m->n_ref--;
                return NULL;
        }

        /* That was the last reference */
        mmap_cache_free(m);
        return NULL;
}

/* Hands out a window slot in *w. As long as the pool is not fully used a
 * fresh slot is taken; afterwards the head of the LRU list is recycled:
 * it is unmapped, detached from its fd's list (if it still has an fd)
 * and removed from the LRU list. Returns 0 on success and -E2BIG if
 * the pool is full and no unreferenced window exists. */
static int mmap_cache_allocate_window(MMapCache *m, unsigned *w) {
        Window *victim;
        unsigned idx, fd_index;

        assert(m);
        assert(w);

        /* Still unused slots around? Then take the next one. */
        if (m->n_windows < m->windows_max) {
                *w = m->n_windows ++;
                return 0;
        }

        idx = m->lru_first;
        if (idx == (unsigned) -1)
                return -E2BIG;

        *w = idx;
        victim = &m->windows[idx];
        assert(victim->n_ref == 0);

        mmap_cache_window_unmap(m, idx);

        /* Windows of closed fds carry fd == -1 and are on no fd list */
        if (victim->fd >= 0) {
                assert_se(mmap_cache_peek_fd_index(m, victim->fd, &fd_index) > 0);
                mmap_cache_fd_remove(m, fd_index, idx);
        }

        mmap_cache_window_remove_lru(m, idx);

        return 0;
}

/* Frees up address space by unmapping the first window on the LRU list
 * that still has a mapping. The window stays on its lists. Returns 1 if
 * a window was unmapped, 0 if there was nothing to unmap. */
static int mmap_cache_make_room(MMapCache *m) {
        unsigned idx;

        assert(m);

        for (idx = m->lru_first; idx != (unsigned) -1; idx = m->windows[idx].lru_next) {

                /* Already unmapped earlier, keep looking */
                if (!m->windows[idx].ptr)
                        continue;

                mmap_cache_window_unmap(m, idx);
                return 1;
        }

        return 0;
}

/* Creates a new mapping covering [offset, offset + size) of fd, stores
 * it in a window slot, links the slot into the list of fd_index and
 * makes context point at it. Requests smaller than WINDOW_SIZE are
 * widened to WINDOW_SIZE, roughly centred around the requested range.
 * On success 1 is returned and *ret points at the byte for offset.
 * On failure a negative errno-style code is returned. */
static int mmap_cache_put(
                MMapCache *m,
                int fd,
                unsigned fd_index,
                int prot,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        uint64_t page_mask, map_offset, map_size;
        unsigned slot;
        Window *win;
        void *addr;
        int r;

        assert(m);
        assert(fd >= 0);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        /* Round the start down and the length up to page boundaries */
        page_mask = (uint64_t) page_size() - 1ULL;
        map_offset = offset & ~page_mask;
        map_size = PAGE_ALIGN(size + (offset - map_offset));

        if (map_size < WINDOW_SIZE) {
                uint64_t slack;

                /* Put half of the extra space in front of the request,
                 * but never start before the beginning of the file */
                slack = PAGE_ALIGN((WINDOW_SIZE - map_size) / 2);
                map_offset = slack > offset ? 0 : map_offset - slack;

                map_size = WINDOW_SIZE;
        }

        /* On ENOMEM drop unused mappings one at a time and try again */
        while ((addr = mmap(NULL, map_size, prot, MAP_SHARED, fd, map_offset)) == MAP_FAILED) {

                if (errno != ENOMEM)
                        return -errno;

                r = mmap_cache_make_room(m);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOMEM;
        }

        r = mmap_cache_allocate_window(m, &slot);
        if (r < 0) {
                munmap(addr, map_size);
                return r;
        }

        win = &m->windows[slot];
        win->fd = fd;
        win->ptr = addr;
        win->offset = map_offset;
        win->size = map_size;
        win->n_ref = 0;

        /* The slot enters the LRU list unreferenced; context_set() below
         * takes it off again as it adds the first reference */
        mmap_cache_window_add_lru(m, slot);
        mmap_cache_fd_add(m, fd_index, slot);
        mmap_cache_context_set(m, context, slot);

        *ret = (uint8_t*) addr + (offset - map_offset);
        return 1;
}

/* Comparison function for qsort()/bsearch() ordering FileDescriptor
 * entries by ascending fd. Returns -1, 0 or 1. */
static int fd_cmp(const void *_a, const void *_b) {
        const FileDescriptor *lhs = _a, *rhs = _b;

        return (lhs->fd > rhs->fd) - (lhs->fd < rhs->fd);
}

/* Looks up fd in the sorted by_fd table without modifying it. Returns 1
 * and stores the table index in *fd_index if the fd is known, and 0
 * (leaving *fd_index untouched) if it is not. */
static int mmap_cache_peek_fd_index(MMapCache *m, int fd, unsigned *fd_index) {
        FileDescriptor key, *j;
        unsigned r;

        assert(m);
        assert(fd >= 0);
        assert(fd_index);

        /* Consistency check: the head window of every list must belong
         * to the fd of its table entry */
        for (r = 0; r < m->n_fds; r++)
                assert(m->by_fd[r].windows == (unsigned) -1 ||
                       m->windows[m->by_fd[r].windows].fd == m->by_fd[r].fd);

        /* fd_cmp() dereferences both of its arguments as FileDescriptor,
         * hence hand it a key of that type rather than a bare int */
        key.fd = fd;
        key.windows = (unsigned) -1;

        j = bsearch(&key, m->by_fd, m->n_fds, sizeof(FileDescriptor), fd_cmp);
        if (!j)
                return 0;

        *fd_index = (unsigned) (j - m->by_fd);
        return 1;
}

/* Like mmap_cache_peek_fd_index(), but registers fd in the by_fd table
 * if it is not known yet. Returns 1 with *fd_index set on success and
 * -E2BIG if the table is full. */
static int mmap_cache_get_fd_index(MMapCache *m, int fd, unsigned *fd_index) {
        FileDescriptor *slot;
        int found;

        assert(m);
        assert(fd >= 0);
        assert(fd_index);

        found = mmap_cache_peek_fd_index(m, fd, fd_index);
        if (found != 0)
                return found;

        if (m->n_fds >= m->fds_max)
                return -E2BIG;

        /* Append a fresh entry with an empty window list ... */
        slot = &m->by_fd[m->n_fds];
        slot->fd = fd;
        slot->windows = (unsigned) -1;
        m->n_fds++;

        /* ... and restore the sort order the lookup relies on */
        qsort(m->by_fd, m->n_fds, sizeof(FileDescriptor), fd_cmp);

        return mmap_cache_peek_fd_index(m, fd, fd_index);
}

/* Checks whether window w can serve the byte range [offset, offset + size).
 * Returns true only if the window currently has a mapping and the range
 * lies completely inside it. */
static bool mmap_cache_test_window(
                MMapCache *m,
                unsigned w,
                uint64_t offset,
                uint64_t size) {
        Window *v;

        assert(m);
        assert(w < m->n_windows);
        assert(size > 0);

        v = m->windows + w;

        /* mmap_cache_make_room() unmaps windows but leaves them on their
         * fd list with offset/size intact. Such a window must not match,
         * or callers would compute a pointer relative to NULL. */
        if (!v->ptr)
                return false;

        if (offset < v->offset || size > v->size)
                return false;

        /* Same test as offset + size <= v->offset + v->size, arranged so
         * that neither side can wrap around */
        return offset - v->offset <= v->size - size;
}

/* Fast path: checks whether the window context points at right now
 * belongs to fd and covers [offset, offset + size). Returns 1 and sets
 * *ret if so, 0 otherwise. No cache state is changed. */
static int mmap_cache_current(
                MMapCache *m,
                int fd,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        Window *cur;
        unsigned idx;

        assert(m);
        assert(fd >= 0);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        idx = m->by_context[context];
        if (idx == (unsigned) -1)
                return 0;

        cur = &m->windows[idx];

        if (cur->fd != fd || !mmap_cache_test_window(m, idx, offset, size))
                return 0;

        *ret = (uint8_t*) cur->ptr + (offset - cur->offset);
        return 1;
}

/* Walks the window list of the fd table entry fd_index looking for a
 * window that covers [offset, offset + size). On a hit the context is
 * pointed at that window, *ret is set and 1 is returned; otherwise 0. */
static int mmap_cache_find(
                MMapCache *m,
                int fd,
                unsigned fd_index,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        unsigned idx;

        assert(m);
        assert(fd >= 0);
        assert(fd_index < m->n_fds);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        for (idx = m->by_fd[fd_index].windows; idx != (unsigned) -1; idx = m->windows[idx].by_fd_next) {
                Window *hit;

                hit = &m->windows[idx];
                assert(hit->fd == fd);

                if (!mmap_cache_test_window(m, idx, offset, size))
                        continue;

                mmap_cache_context_set(m, context, idx);

                *ret = (uint8_t*) hit->ptr + (offset - hit->offset);
                return 1;
        }

        return 0;
}

/* Returns in *ret a pointer to the bytes [offset, offset + size) of fd,
 * mapped with protection prot, and remembers the window used for this
 * as the current one of context. Existing windows are reused where
 * possible. Returns 1 on success and a negative errno-style code on
 * failure. */
int mmap_cache_get(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        unsigned fd_index;
        int k;

        assert(m);
        assert(fd >= 0);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        /* Cheapest case first: the window this context used last time
         * might cover the request already */
        k = mmap_cache_current(m, fd, context, offset, size, ret);
        if (k != 0)
                return k;

        /* It doesn't, so the context lets go of it */
        mmap_cache_context_unset(m, context);

        /* Look up (or create) the table entry for this fd */
        k = mmap_cache_get_fd_index(m, fd, &fd_index);
        if (k < 0)
                return k;

        /* Try the windows we already have for the fd, and only if none
         * of them fits establish a new mapping */
        k = mmap_cache_find(m, fd, fd_index, context, offset, size, ret);
        if (k == 0)
                k = mmap_cache_put(m, fd, fd_index, prot, context, offset, size, ret);

        return k;
}

/* Forgets everything the cache knows about fd: all contexts pointing at
 * one of its windows are reset, all of its windows are unmapped and
 * detached (they stay on the LRU list with fd == -1 for recycling), and
 * the fd's entry is removed from the by_fd table. Does nothing if the
 * fd is not known to the cache. */
void mmap_cache_close_fd(MMapCache *m, int fd) {
        unsigned fd_index, c, w;
        int r;

        assert(m);
        /* 0 is a valid file descriptor, and mmap_cache_get() accepts it */
        assert(fd >= 0);

        r = mmap_cache_peek_fd_index(m, fd, &fd_index);
        if (r <= 0)
                return;

        /* Drop all context references first, so that every window of
         * this fd ends up unreferenced */
        for (c = 0; c < m->contexts_max; c++) {
                w = m->by_context[c];
                if (w == (unsigned) -1)
                        continue;

                if (m->windows[w].fd == fd)
                        mmap_cache_context_unset(m, c);
        }

        /* Pop windows off the head of the fd's list until it is empty */
        w = m->by_fd[fd_index].windows;
        while (w != (unsigned) -1) {
                Window *v;

                v = m->windows + w;
                assert(v->fd == fd);

                mmap_cache_window_unmap(m, w);
                mmap_cache_fd_remove(m, fd_index, w);
                v->fd = -1;

                w = m->by_fd[fd_index].windows;
        }

        /* Close the gap in the table; this keeps it sorted */
        memmove(m->by_fd + fd_index, m->by_fd + fd_index + 1, (m->n_fds - (fd_index + 1)) * sizeof(FileDescriptor));
        m->n_fds --;
}

/* Drops all windows of fd that reach beyond file offset p. All contexts
 * pointing at any window of the fd are reset, including those whose
 * window is kept. Does nothing if the fd is not known to the cache. */
void mmap_cache_close_fd_range(MMapCache *m, int fd, uint64_t p) {
        unsigned fd_index, c, w;
        int r;

        assert(m);
        /* 0 is a valid file descriptor, and mmap_cache_get() accepts it */
        assert(fd >= 0);

        /* This drops all windows that include space to the right of the
         * specified offset. This is useful to ensure that after the
         * file size is extended we drop our mappings of the end and
         * create them anew, since otherwise it is undefined whether
         * the mapping will continue to work as intended. */

        r = mmap_cache_peek_fd_index(m, fd, &fd_index);
        if (r <= 0)
                return;

        for (c = 0; c < m->contexts_max; c++) {
                w = m->by_context[c];

                if (w != (unsigned) -1 && m->windows[w].fd == fd)
                        mmap_cache_context_unset(m, c);
        }

        w = m->by_fd[fd_index].windows;
        while (w != (unsigned) -1) {
                Window *v;
                unsigned next;

                v = m->windows + w;
                assert(v->fd == fd);
                assert(v->by_fd_next == (unsigned) -1 ||
                       m->windows[v->by_fd_next].fd == fd);

                /* Remember the successor before unlinking, so the walk
                 * continues from here instead of rescanning the windows
                 * that were already looked at and kept */
                next = v->by_fd_next;

                if (v->offset + v->size > p) {
                        mmap_cache_window_unmap(m, w);
                        mmap_cache_fd_remove(m, fd_index, w);
                        v->fd = -1;
                }

                w = next;
        }
}

/* Releases the window the given context currently points at, if any.
 * Public wrapper around mmap_cache_context_unset(). */
void mmap_cache_close_context(MMapCache *m, unsigned context) {
        mmap_cache_context_unset(m, context);
}