/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <assert.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "util.h"

#include "mmap-cache.h"

#define WINDOW_SIZE (8ULL*1024ULL*1024ULL)

#define DEFAULT_WINDOWS_MAX 64
#define DEFAULT_FDS_MAX 32
#define DEFAULT_CONTEXTS_MAX 32

typedef struct Window {
        int fd;
        void *ptr;
        uint64_t offset;
        uint64_t size;

        unsigned n_ref;
        unsigned lru_prev;
        unsigned lru_next;

        unsigned by_fd_prev;
        unsigned by_fd_next;
} Window;

typedef struct FileDescriptor {
        int fd;
        unsigned windows;
} FileDescriptor;

struct MMapCache {
        unsigned n_ref;

        unsigned contexts_max;
        unsigned windows_max;
        unsigned fds_max;

        unsigned n_windows;
        unsigned n_fds;

        unsigned lru_first, lru_last;

        Window *windows;
        unsigned *by_context;
        FileDescriptor *by_fd;
};

static int mmap_cache_peek_fd_index(MMapCache *m, int fd, unsigned *fd_index);

static void mmap_cache_window_unmap(MMapCache *m, unsigned w) {
        Window *v;

        assert(m);
        assert(w < m->n_windows);

        v = m->windows + w;
        if (!v->ptr)
                return;

        munmap(v->ptr, v->size);
        v->ptr = NULL;
}

static void mmap_cache_window_add_lru(MMapCache *m, unsigned w) {
        Window *v;

        assert(m);
        assert(w < m->n_windows);

        v = m->windows + w;
        assert(v->n_ref == 0);

        if (m->lru_last != (unsigned) -1) {
                assert(m->windows[m->lru_last].lru_next == (unsigned) -1);
                m->windows[m->lru_last].lru_next = w;
        }

        v->lru_prev = m->lru_last;
        v->lru_next = (unsigned) -1;

        m->lru_last = w;
        if (m->lru_first == (unsigned) -1)
                m->lru_first = w;
}

static void mmap_cache_window_remove_lru(MMapCache *m, unsigned w) {
        Window *v;

        assert(m);
        assert(w < m->n_windows);

        v = m->windows + w;

        if (v->lru_prev == (unsigned) -1) {
                assert(m->lru_first == w);
                m->lru_first = v->lru_next;
        } else {
                assert(m->windows[v->lru_prev].lru_next == w);
                m->windows[v->lru_prev].lru_next = v->lru_next;
        }

        if (v->lru_next == (unsigned) -1) {
                assert(m->lru_last == w);
                m->lru_last = v->lru_prev;
        } else {
                assert(m->windows[v->lru_next].lru_prev == w);
                m->windows[v->lru_next].lru_prev = v->lru_prev;
        }
}

static void mmap_cache_fd_add(MMapCache *m, unsigned fd_index, unsigned w) {
        Window *v;

        assert(m);
        assert(fd_index < m->n_fds);

        v = m->windows + w;
        assert(m->by_fd[fd_index].fd == v->fd);

        if (m->by_fd[fd_index].windows != (unsigned) -1) {
                assert(m->windows[m->by_fd[fd_index].windows].by_fd_prev == (unsigned) -1);
                m->windows[m->by_fd[fd_index].windows].by_fd_prev = w;
        }

        v->by_fd_next = m->by_fd[fd_index].windows;
        v->by_fd_prev = (unsigned) -1;

        m->by_fd[fd_index].windows = w;
}

static void mmap_cache_fd_remove(MMapCache *m, unsigned fd_index, unsigned w) {
        Window *v;

        assert(m);
        assert(fd_index < m->n_fds);

        v = m->windows + w;
        assert(m->by_fd[fd_index].fd == v->fd);
        assert(v->by_fd_next == (unsigned) -1 || m->windows[v->by_fd_next].fd == v->fd);
        assert(v->by_fd_prev == (unsigned) -1 || m->windows[v->by_fd_prev].fd == v->fd);

        if (v->by_fd_prev == (unsigned) -1) {
                assert(m->by_fd[fd_index].windows == w);
                m->by_fd[fd_index].windows = v->by_fd_next;
        } else {
                assert(m->windows[v->by_fd_prev].by_fd_next == w);
                m->windows[v->by_fd_prev].by_fd_next = v->by_fd_next;
        }

        if (v->by_fd_next != (unsigned) -1) {
                assert(m->windows[v->by_fd_next].by_fd_prev == w);
                m->windows[v->by_fd_next].by_fd_prev = v->by_fd_prev;
        }
}

static void mmap_cache_context_unset(MMapCache *m, unsigned c) {
        Window *v;
        unsigned w;

        assert(m);
        assert(c < m->contexts_max);

        if (m->by_context[c] == (unsigned) -1)
                return;

        w = m->by_context[c];
        m->by_context[c] = (unsigned) -1;

        v = m->windows + w;
        assert(v->n_ref > 0);
        v->n_ref --;

        if (v->n_ref == 0)
                mmap_cache_window_add_lru(m, w);
}

static void mmap_cache_context_set(MMapCache *m, unsigned c, unsigned w) {
        Window *v;

        assert(m);
        assert(c < m->contexts_max);
        assert(w < m->n_windows);

        if (m->by_context[c] == w)
                return;

        mmap_cache_context_unset(m, c);

        m->by_context[c] = w;

        v = m->windows + w;
        v->n_ref ++;

        if (v->n_ref == 1)
                mmap_cache_window_remove_lru(m, w);
}

static void mmap_cache_free(MMapCache *m) {

        assert(m);

        if (m->windows) {
                unsigned w;

                for (w = 0; w < m->n_windows; w++)
                        mmap_cache_window_unmap(m, w);

                free(m->windows);
        }

        free(m->by_context);
        free(m->by_fd);
        free(m);
}

MMapCache* mmap_cache_new(void) {
        MMapCache *m;

        m = new0(MMapCache, 1);
        if (!m)
                return NULL;

        m->contexts_max = DEFAULT_CONTEXTS_MAX;
        m->fds_max = DEFAULT_FDS_MAX;
        m->windows_max = DEFAULT_WINDOWS_MAX;
        m->n_ref = 1;
        m->lru_first = (unsigned) -1;
        m->lru_last = (unsigned) -1;

        m->windows = new(Window, m->windows_max);
        if (!m->windows) {
                mmap_cache_free(m);
                return NULL;
        }

        m->by_context = new(unsigned, m->contexts_max);
        if (!m->by_context) {
                mmap_cache_free(m);
                return NULL;
        }
        memset(m->by_context, -1, m->contexts_max * sizeof(unsigned));

        m->by_fd = new(FileDescriptor, m->fds_max);
        if (!m->by_fd) {
                mmap_cache_free(m);
                return NULL;
        }

        return m;
}

MMapCache* mmap_cache_ref(MMapCache *m) {
        assert(m);
        assert(m->n_ref > 0);

        m->n_ref++;
        return m;
}

MMapCache* mmap_cache_unref(MMapCache *m) {
        assert(m);
        assert(m->n_ref > 0);

        if (m->n_ref == 1)
                mmap_cache_free(m);
        else
                m->n_ref--;

        return NULL;
}

static int mmap_cache_allocate_window(MMapCache *m, unsigned *w) {
        Window *v;
        unsigned fd_index;

        assert(m);
        assert(w);

        if (m->n_windows < m->windows_max) {
                *w = m->n_windows ++;
                return 0;
        }

        if (m->lru_first == (unsigned) -1)
                return -E2BIG;

        *w = m->lru_first;
        v = m->windows + *w;
        assert(v->n_ref == 0);

        mmap_cache_window_unmap(m, *w);

        if (v->fd >= 0) {
                assert_se(mmap_cache_peek_fd_index(m, v->fd, &fd_index) > 0);
                mmap_cache_fd_remove(m, fd_index, *w);
        }

        mmap_cache_window_remove_lru(m, *w);

        return 0;
}

static int mmap_cache_make_room(MMapCache *m) {
        unsigned w;

        assert(m);

        w = m->lru_first;
        while (w != (unsigned) -1) {
                Window *v;

                v = m->windows + w;
                assert(v->n_ref == 0);

                if (v->ptr) {
                        mmap_cache_window_unmap(m, w);
                        return 1;
                }

                w = v->lru_next;
        }

        return 0;
}

static int mmap_cache_put(
                MMapCache *m,
                int fd,
                unsigned fd_index,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                uint64_t size,
                struct stat *st,
                void **ret) {

        unsigned w;
        Window *v;
        void *d;
        uint64_t woffset, wsize;
        int r;

        assert(m);
        assert(fd >= 0);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        woffset = offset & ~((uint64_t) page_size() - 1ULL);
        wsize = size + (offset - woffset);
        wsize = PAGE_ALIGN(wsize);

        if (wsize < WINDOW_SIZE) {
                uint64_t delta;

                delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);

                if (delta > offset)
                        woffset = 0;
                else
                        woffset -= delta;

                wsize = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger then the files
                   underneath have undefined behavior. Hence, clamp
                   things to the file size if we know it */

                if (woffset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (woffset + wsize > (uint64_t) st->st_size)
                        wsize = PAGE_ALIGN(st->st_size - woffset);
        }

        for (;;) {
                d = mmap(NULL, wsize, prot, MAP_SHARED, fd, woffset);
                if (d != MAP_FAILED)
                        break;
                if (errno != ENOMEM)
                        return -errno;

                r = mmap_cache_make_room(m);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOMEM;
        }

        r = mmap_cache_allocate_window(m, &w);
        if (r < 0) {
                munmap(d, wsize);
                return r;
        }

        v = m->windows + w;
        v->fd = fd;
        v->ptr = d;
        v->offset = woffset;
        v->size = wsize;

        if (keep_always)
                v->n_ref = 1;
        else {
                v->n_ref = 0;
                mmap_cache_window_add_lru(m, w);
        }

        mmap_cache_fd_add(m, fd_index, w);
        mmap_cache_context_set(m, context, w);

        *ret = (uint8_t*) d + (offset - woffset);
        return 1;
}

static int fd_cmp(const void *_a, const void *_b) {
        const FileDescriptor *a = _a, *b = _b;

        if (a->fd < b->fd)
                return -1;
        if (a->fd > b->fd)
                return 1;

        return 0;
}

static int mmap_cache_peek_fd_index(MMapCache *m, int fd, unsigned *fd_index) {
        FileDescriptor *j;
        unsigned r;

        assert(m);
        assert(fd >= 0);
        assert(fd_index);

        for (r = 0; r < m->n_fds; r++)
                assert(m->by_fd[r].windows == (unsigned) -1 ||
                       m->windows[m->by_fd[r].windows].fd == m->by_fd[r].fd);

        j = bsearch(&fd, m->by_fd, m->n_fds, sizeof(FileDescriptor), fd_cmp);
        if (!j)
                return 0;

        *fd_index = (unsigned) (j - m->by_fd);
        return 1;
}

static int mmap_cache_get_fd_index(MMapCache *m, int fd, unsigned *fd_index) {
        FileDescriptor *j;
        int r;

        assert(m);
        assert(fd >= 0);
        assert(fd_index);

        r = mmap_cache_peek_fd_index(m, fd, fd_index);
        if (r != 0)
                return r;

        if (m->n_fds >= m->fds_max) {
                unsigned k;
                FileDescriptor *n;

                k = m->n_fds * 2;
                n = realloc(m->by_fd, sizeof(FileDescriptor) * k);
                if (!n)
                        return -ENOMEM;

                m->fds_max = k;
                m->by_fd = n;
        }

        j = m->by_fd + m->n_fds ++;
        j->fd = fd;
        j->windows = (unsigned) -1;

        qsort(m->by_fd, m->n_fds, sizeof(FileDescriptor), fd_cmp);

        return mmap_cache_peek_fd_index(m, fd, fd_index);
}

static bool mmap_cache_test_window(
                MMapCache *m,
                unsigned w,
                uint64_t offset,
                uint64_t size) {
        Window *v;

        assert(m);
        assert(w < m->n_windows);
        assert(size > 0);

        v = m->windows + w;

        return offset >= v->offset &&
                offset + size <= v->offset + v->size;
}

static int mmap_cache_current(
                MMapCache *m,
                int fd,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        Window *v;
        unsigned w;

        assert(m);
        assert(fd >= 0);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        if (m->by_context[context] == (unsigned) -1)
                return 0;

        w = m->by_context[context];
        v = m->windows + w;

        if (v->fd != fd)
                return 0;

        if (!mmap_cache_test_window(m, w, offset, size))
                return 0;

        *ret = (uint8_t*) v->ptr + (offset - v->offset);
        return 1;
}

static int mmap_cache_find(
                MMapCache *m,
                int fd,
                unsigned fd_index,
                unsigned context,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        Window *v = NULL;
        unsigned w;

        assert(m);
        assert(fd >= 0);
        assert(fd_index < m->n_fds);
        assert(context < m->contexts_max);
        assert(size > 0);
        assert(ret);

        w = m->by_fd[fd_index].windows;
        while (w != (unsigned) -1) {
                v = m->windows + w;
                assert(v->fd == fd);

                if (mmap_cache_test_window(m, w, offset, size))
                        break;

                w = v->by_fd_next;
        }

        if (w == (unsigned) -1)
                return 0;

        mmap_cache_context_set(m, context, w);

        *ret = (uint8_t*) v->ptr + (offset - v->offset);
        return 1;
}

int mmap_cache_get(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                uint64_t size,
                struct stat *st,
                void **ret) {

        unsigned fd_index;
        int r;

        assert(m);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);

        if (context >= m->contexts_max) {
                unsigned k, *n;
                Window *w;

                /* Increase the number of contexts if necessary, and
                 * make sure we have twice the number of windows */

                k = context * 2;
                n = realloc(m->by_context, sizeof(unsigned) * k);
                if (!n)
                        return -ENOMEM;
                memset(n + m->contexts_max, -1, (k - m->contexts_max) * sizeof(unsigned));
                m->contexts_max = k;
                m->by_context = n;

                k = MAX(m->windows_max, m->contexts_max*2);
                w = realloc(m->windows, sizeof(Window) * k);
                if (!w)
                        return -ENOMEM;

                m->windows_max = k;
                m->windows = w;
        }

        /* Maybe the current pointer for this context is already the
         * right one? */
        r = mmap_cache_current(m, fd, context, offset, size, ret);
        if (r != 0)
                return r;

        /* Hmm, drop the reference to the current one, since it wasn't
         * good enough */
        mmap_cache_context_unset(m, context);

        /* OK, let's find the chain for this FD */
        r = mmap_cache_get_fd_index(m, fd, &fd_index);
        if (r < 0)
                return r;

        /* And let's look through the available mmaps */
        r = mmap_cache_find(m, fd, fd_index, context, offset, size, ret);
        if (r != 0)
                return r;

        /* Not found? Then, let's add it */
        return mmap_cache_put(m, fd, fd_index, prot, context, keep_always, offset, size, st, ret);
}

void mmap_cache_close_fd(MMapCache *m, int fd) {
        unsigned fd_index, c, w;
        int r;

        assert(m);
        assert(fd > 0);

        r = mmap_cache_peek_fd_index(m, fd, &fd_index);
        if (r <= 0)
                return;

        for (c = 0; c < m->contexts_max; c++) {
                w = m->by_context[c];
                if (w == (unsigned) -1)
                        continue;

                if (m->windows[w].fd == fd)
                        mmap_cache_context_unset(m, c);
        }

        w = m->by_fd[fd_index].windows;
        while (w != (unsigned) -1) {
                Window *v;

                v = m->windows + w;
                assert(v->fd == fd);

                mmap_cache_window_unmap(m, w);
                mmap_cache_fd_remove(m, fd_index, w);
                v->fd = -1;

                w = m->by_fd[fd_index].windows;
        }

        memmove(m->by_fd + fd_index, m->by_fd + fd_index + 1, (m->n_fds - (fd_index + 1)) * sizeof(FileDescriptor));
        m->n_fds --;
}

void mmap_cache_close_context(MMapCache *m, unsigned context) {
        mmap_cache_context_unset(m, context);
}