diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-01-20 14:01:31 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-01-20 14:01:31 -0300 |
commit | b4b7ff4b08e691656c9d77c758fc355833128ac0 (patch) | |
tree | 82fcb00e6b918026dc9f2d1f05ed8eee83874cc0 /drivers/misc/mic/scif | |
parent | 35acfa0fc609f2a2cd95cef4a6a9c3a5c38f1778 (diff) |
Linux-libre 4.4-gnupck-4.4-gnu
Diffstat (limited to 'drivers/misc/mic/scif')
-rw-r--r-- | drivers/misc/mic/scif/Makefile | 5 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_api.c | 234 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_debugfs.c | 85 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_dma.c | 1979 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_epd.c | 26 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_epd.h | 50 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_fd.c | 178 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_fence.c | 771 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_main.c | 111 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_main.h | 37 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_map.h | 25 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_mmap.c | 699 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_nm.c | 20 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_nodeqp.c | 149 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_nodeqp.h | 42 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_peer_bus.c | 179 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_peer_bus.h | 42 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_rma.c | 1775 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_rma.h | 464 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_rma_list.c | 291 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_rma_list.h | 57 |
21 files changed, 6956 insertions, 263 deletions
diff --git a/drivers/misc/mic/scif/Makefile b/drivers/misc/mic/scif/Makefile index bf10bb7e2..29cfc3e51 100644 --- a/drivers/misc/mic/scif/Makefile +++ b/drivers/misc/mic/scif/Makefile @@ -13,3 +13,8 @@ scif-objs += scif_epd.o scif-objs += scif_rb.o scif-objs += scif_nodeqp.o scif-objs += scif_nm.o +scif-objs += scif_dma.o +scif-objs += scif_fence.o +scif-objs += scif_mmap.o +scif-objs += scif_rma.o +scif-objs += scif_rma_list.o diff --git a/drivers/misc/mic/scif/scif_api.c b/drivers/misc/mic/scif/scif_api.c index f39d3135a..ddc9e4b08 100644 --- a/drivers/misc/mic/scif/scif_api.c +++ b/drivers/misc/mic/scif/scif_api.c @@ -37,9 +37,21 @@ enum conn_async_state { ASYNC_CONN_FLUSH_WORK /* async work flush in progress */ }; +/* + * File operations for anonymous inode file associated with a SCIF endpoint, + * used in kernel mode SCIF poll. Kernel mode SCIF poll calls portions of the + * poll API in the kernel and these take in a struct file *. Since a struct + * file is not available to kernel mode SCIF, it uses an anonymous file for + * this purpose. + */ +const struct file_operations scif_anon_fops = { + .owner = THIS_MODULE, +}; + scif_epd_t scif_open(void) { struct scif_endpt *ep; + int err; might_sleep(); ep = kzalloc(sizeof(*ep), GFP_KERNEL); @@ -50,15 +62,22 @@ scif_epd_t scif_open(void) if (!ep->qp_info.qp) goto err_qp_alloc; + err = scif_anon_inode_getfile(ep); + if (err) + goto err_anon_inode; + spin_lock_init(&ep->lock); mutex_init(&ep->sendlock); mutex_init(&ep->recvlock); + scif_rma_ep_init(ep); ep->state = SCIFEP_UNBOUND; dev_dbg(scif_info.mdev.this_device, "SCIFAPI open: ep %p success\n", ep); return ep; +err_anon_inode: + kfree(ep->qp_info.qp); err_qp_alloc: kfree(ep); err_ep_alloc: @@ -166,8 +185,11 @@ int scif_close(scif_epd_t epd) switch (oldstate) { case SCIFEP_ZOMBIE: + dev_err(scif_info.mdev.this_device, + "SCIFAPI close: zombie state unexpected\n"); case SCIFEP_DISCONNECTED: spin_unlock(&ep->lock); + scif_unregister_all_windows(epd); /* Remove from the disconnected list */ mutex_lock(&scif_info.connlock); list_for_each_safe(pos, tmpq, &scif_info.disconnected) { @@ -189,6 +211,7 @@ int scif_close(scif_epd_t epd) case SCIFEP_CLOSING: { spin_unlock(&ep->lock); + scif_unregister_all_windows(epd); scif_disconnect_ep(ep); break; } @@ -200,7 +223,7 @@ int scif_close(scif_epd_t epd) struct scif_endpt *aep; spin_unlock(&ep->lock); - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); /* remove from listen list */ list_for_each_safe(pos, tmpq, &scif_info.listen) { @@ -222,7 +245,7 @@ int scif_close(scif_epd_t epd) break; } } - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); mutex_lock(&scif_info.connlock); list_for_each_safe(pos, tmpq, &scif_info.connected) { tmpep = list_entry(pos, @@ -242,13 +265,13 @@ int scif_close(scif_epd_t epd) } mutex_unlock(&scif_info.connlock); scif_teardown_ep(aep); - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); scif_add_epd_to_zombie_list(aep, SCIF_EPLOCK_HELD); ep->acceptcnt--; } spin_lock(&ep->lock); - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); /* Remove and reject any pending connection requests. */ while (ep->conreqcnt) { @@ -279,6 +302,7 @@ int scif_close(scif_epd_t epd) } } scif_put_port(ep->port.port); + scif_anon_inode_fput(ep); scif_teardown_ep(ep); scif_add_epd_to_zombie_list(ep, !SCIF_EPLOCK_HELD); return 0; @@ -409,9 +433,9 @@ int scif_listen(scif_epd_t epd, int backlog) scif_teardown_ep(ep); ep->qp_info.qp = NULL; - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); list_add_tail(&ep->list, &scif_info.listen); - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); return 0; } EXPORT_SYMBOL_GPL(scif_listen); @@ -450,6 +474,13 @@ static int scif_conn_func(struct scif_endpt *ep) struct scifmsg msg; struct device *spdev; + err = scif_reserve_dma_chan(ep); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + ep->state = SCIFEP_BOUND; + goto connect_error_simple; + } /* Initiate the first part of the endpoint QP setup */ err = scif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset, SCIF_ENDPT_QP_SIZE, ep->remote_dev); @@ -558,8 +589,10 @@ void scif_conn_handler(struct work_struct *work) list_del(&ep->conn_list); } spin_unlock(&scif_info.nb_connect_lock); - if (ep) + if (ep) { ep->conn_err = scif_conn_func(ep); + wake_up_interruptible(&ep->conn_pend_wq); + } } while (ep); } @@ -660,6 +693,7 @@ int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block) ep->remote_dev = &scif_dev[dst->node]; ep->qp_info.qp->magic = SCIFEP_MAGIC; if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + init_waitqueue_head(&ep->conn_pend_wq); spin_lock(&scif_info.nb_connect_lock); list_add_tail(&ep->conn_list, &scif_info.nb_connect_list); spin_unlock(&scif_info.nb_connect_lock); @@ -782,12 +816,25 @@ retry_connection: cep->remote_dev = &scif_dev[peer->node]; cep->remote_ep = conreq->msg.payload[0]; + scif_rma_ep_init(cep); + + err = scif_reserve_dma_chan(cep); + if (err) { + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + goto scif_accept_error_qpalloc; + } + cep->qp_info.qp = kzalloc(sizeof(*cep->qp_info.qp), GFP_KERNEL); if (!cep->qp_info.qp) { err = -ENOMEM; goto scif_accept_error_qpalloc; } + err = scif_anon_inode_getfile(cep); + if (err) + goto scif_accept_error_anon_inode; + cep->qp_info.qp->magic = SCIFEP_MAGIC; spdev = scif_get_peer_dev(cep->remote_dev); if (IS_ERR(spdev)) { @@ -858,6 +905,8 @@ retry: spin_unlock(&cep->lock); return 0; scif_accept_error_map: + scif_anon_inode_fput(cep); +scif_accept_error_anon_inode: scif_teardown_ep(cep); scif_accept_error_qpalloc: kfree(cep); @@ -1247,6 +1296,134 @@ int scif_recv(scif_epd_t epd, void *msg, int len, int flags) } EXPORT_SYMBOL_GPL(scif_recv); +static inline void _scif_poll_wait(struct file *f, wait_queue_head_t *wq, + poll_table *p, struct scif_endpt *ep) +{ + /* + * Because poll_wait makes a GFP_KERNEL allocation, give up the lock + * and regrab it afterwards. Because the endpoint state might have + * changed while the lock was given up, the state must be checked + * again after re-acquiring the lock. The code in __scif_pollfd(..) + * does this. + */ + spin_unlock(&ep->lock); + poll_wait(f, wq, p); + spin_lock(&ep->lock); +} + +unsigned int +__scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep) +{ + unsigned int mask = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]); + + spin_lock(&ep->lock); + + /* Endpoint is waiting for a non-blocking connect to complete */ + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + _scif_poll_wait(f, &ep->conn_pend_wq, wait, ep); + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + if (ep->state == SCIFEP_CONNECTED || + ep->state == SCIFEP_DISCONNECTED || + ep->conn_err) + mask |= POLLOUT; + goto exit; + } + } + + /* Endpoint is listening for incoming connection requests */ + if (ep->state == SCIFEP_LISTENING) { + _scif_poll_wait(f, &ep->conwq, wait, ep); + if (ep->state == SCIFEP_LISTENING) { + if (ep->conreqcnt) + mask |= POLLIN; + goto exit; + } + } + + /* Endpoint is connected or disconnected */ + if (ep->state == SCIFEP_CONNECTED || ep->state == SCIFEP_DISCONNECTED) { + if (poll_requested_events(wait) & POLLIN) + _scif_poll_wait(f, &ep->recvwq, wait, ep); + if (poll_requested_events(wait) & POLLOUT) + _scif_poll_wait(f, &ep->sendwq, wait, ep); + if (ep->state == SCIFEP_CONNECTED || + ep->state == SCIFEP_DISCONNECTED) { + /* Data can be read without blocking */ + if (scif_rb_count(&ep->qp_info.qp->inbound_q, 1)) + mask |= POLLIN; + /* Data can be written without blocking */ + if (scif_rb_space(&ep->qp_info.qp->outbound_q)) + mask |= POLLOUT; + /* Return POLLHUP if endpoint is disconnected */ + if (ep->state == SCIFEP_DISCONNECTED) + mask |= POLLHUP; + goto exit; + } + } + + /* Return POLLERR if the endpoint is in none of the above states */ + mask |= POLLERR; +exit: + spin_unlock(&ep->lock); + return mask; +} + +/** + * scif_poll() - Kernel mode SCIF poll + * @ufds: Array of scif_pollepd structures containing the end points + * and events to poll on + * @nfds: Size of the ufds array + * @timeout_msecs: Timeout in msecs, -ve implies infinite timeout + * + * The code flow in this function is based on do_poll(..) in select.c + * + * Returns the number of endpoints which have pending events or 0 in + * the event of a timeout. If a signal is used for wake up, -EINTR is + * returned. + */ +int +scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs) +{ + struct poll_wqueues table; + poll_table *pt; + int i, mask, count = 0, timed_out = timeout_msecs == 0; + u64 timeout = timeout_msecs < 0 ? MAX_SCHEDULE_TIMEOUT + : msecs_to_jiffies(timeout_msecs); + + poll_initwait(&table); + pt = &table.pt; + while (1) { + for (i = 0; i < nfds; i++) { + pt->_key = ufds[i].events | POLLERR | POLLHUP; + mask = __scif_pollfd(ufds[i].epd->anon, + pt, ufds[i].epd); + mask &= ufds[i].events | POLLERR | POLLHUP; + if (mask) { + count++; + pt->_qproc = NULL; + } + ufds[i].revents = mask; + } + pt->_qproc = NULL; + if (!count) { + count = table.error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + if (!schedule_timeout_interruptible(timeout)) + timed_out = 1; + } + poll_freewait(&table); + return count; +} +EXPORT_SYMBOL_GPL(scif_poll); + int scif_get_node_ids(u16 *nodes, int len, u16 *self) { int online = 0; @@ -1274,3 +1451,46 @@ int scif_get_node_ids(u16 *nodes, int len, u16 *self) return online; } EXPORT_SYMBOL_GPL(scif_get_node_ids); + +static int scif_add_client_dev(struct device *dev, struct subsys_interface *si) +{ + struct scif_client *client = + container_of(si, struct scif_client, si); + struct scif_peer_dev *spdev = + container_of(dev, struct scif_peer_dev, dev); + + if (client->probe) + client->probe(spdev); + return 0; +} + +static void scif_remove_client_dev(struct device *dev, + struct subsys_interface *si) +{ + struct scif_client *client = + container_of(si, struct scif_client, si); + struct scif_peer_dev *spdev = + container_of(dev, struct scif_peer_dev, dev); + + if (client->remove) + client->remove(spdev); +} + +void scif_client_unregister(struct scif_client *client) +{ + subsys_interface_unregister(&client->si); +} +EXPORT_SYMBOL_GPL(scif_client_unregister); + +int scif_client_register(struct scif_client *client) +{ + struct subsys_interface *si = &client->si; + + si->name = client->name; + si->subsys = &scif_peer_bus; + si->add_dev = scif_add_client_dev; + si->remove_dev = scif_remove_client_dev; + + return subsys_interface_register(&client->si); +} +EXPORT_SYMBOL_GPL(scif_client_register); diff --git a/drivers/misc/mic/scif/scif_debugfs.c b/drivers/misc/mic/scif/scif_debugfs.c index 51f14e2a1..6884dad97 100644 --- a/drivers/misc/mic/scif/scif_debugfs.c +++ b/drivers/misc/mic/scif/scif_debugfs.c @@ -62,10 +62,87 @@ static const struct file_operations scif_dev_ops = { .release = scif_dev_test_release }; -void __init scif_init_debugfs(void) +static void scif_display_window(struct scif_window *window, struct seq_file *s) +{ + int j; + struct scatterlist *sg; + scif_pinned_pages_t pin = window->pinned_pages; + + seq_printf(s, "window %p type %d temp %d offset 0x%llx ", + window, window->type, window->temp, window->offset); + seq_printf(s, "nr_pages 0x%llx nr_contig_chunks 0x%x prot %d ", + window->nr_pages, window->nr_contig_chunks, window->prot); + seq_printf(s, "ref_count %d magic 0x%llx peer_window 0x%llx ", + window->ref_count, window->magic, window->peer_window); + seq_printf(s, "unreg_state 0x%x va_for_temp 0x%lx\n", + window->unreg_state, window->va_for_temp); + + for (j = 0; j < window->nr_contig_chunks; j++) + seq_printf(s, "page[%d] dma_addr 0x%llx num_pages 0x%llx\n", j, + window->dma_addr[j], window->num_pages[j]); + + if (window->type == SCIF_WINDOW_SELF && pin) + for (j = 0; j < window->nr_pages; j++) + seq_printf(s, "page[%d] = pinned_pages %p address %p\n", + j, pin->pages[j], + page_address(pin->pages[j])); + + if (window->st) + for_each_sg(window->st->sgl, sg, window->st->nents, j) + seq_printf(s, "sg[%d] dma addr 0x%llx length 0x%x\n", + j, sg_dma_address(sg), sg_dma_len(sg)); +} + +static void scif_display_all_windows(struct list_head *head, struct seq_file *s) { - struct dentry *d; + struct list_head *item; + struct scif_window *window; + list_for_each(item, head) { + window = list_entry(item, struct scif_window, list); + scif_display_window(window, s); + } +} + +static int scif_rma_test(struct seq_file *s, void *unused) +{ + struct scif_endpt *ep; + struct list_head *pos; + + mutex_lock(&scif_info.connlock); + list_for_each(pos, &scif_info.connected) { + ep = list_entry(pos, struct scif_endpt, list); + seq_printf(s, "ep %p self windows\n", ep); + mutex_lock(&ep->rma_info.rma_lock); + scif_display_all_windows(&ep->rma_info.reg_list, s); + seq_printf(s, "ep %p remote windows\n", ep); + scif_display_all_windows(&ep->rma_info.remote_reg_list, s); + mutex_unlock(&ep->rma_info.rma_lock); + } + mutex_unlock(&scif_info.connlock); + return 0; +} + +static int scif_rma_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_rma_test, inode->i_private); +} + +static int scif_rma_test_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static const struct file_operations scif_rma_ops = { + .owner = THIS_MODULE, + .open = scif_rma_test_open, + .read = seq_read, + .llseek = seq_lseek, + .release = scif_rma_test_release +}; + +void __init scif_init_debugfs(void) +{ scif_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); if (!scif_dbg) { dev_err(scif_info.mdev.this_device, @@ -73,8 +150,8 @@ void __init scif_init_debugfs(void) return; } - d = debugfs_create_file("scif_dev", 0444, scif_dbg, - NULL, &scif_dev_ops); + debugfs_create_file("scif_dev", 0444, scif_dbg, NULL, &scif_dev_ops); + debugfs_create_file("scif_rma", 0444, scif_dbg, NULL, &scif_rma_ops); debugfs_create_u8("en_msg_log", 0666, scif_dbg, &scif_info.en_msg_log); debugfs_create_u8("p2p_enable", 0666, scif_dbg, &scif_info.p2p_enable); } diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c new file mode 100644 index 000000000..95a13c629 --- /dev/null +++ b/drivers/misc/mic/scif/scif_dma.c @@ -0,0 +1,1979 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_main.h" +#include "scif_map.h" + +/* + * struct scif_dma_comp_cb - SCIF DMA completion callback + * + * @dma_completion_func: DMA completion callback + * @cb_cookie: DMA completion callback cookie + * @temp_buf: Temporary buffer + * @temp_buf_to_free: Temporary buffer to be freed + * @is_cache: Is a kmem_cache allocated buffer + * @dst_offset: Destination registration offset + * @dst_window: Destination registration window + * @len: Length of the temp buffer + * @temp_phys: DMA address of the temp buffer + * @sdev: The SCIF device + * @header_padding: padding for cache line alignment + */ +struct scif_dma_comp_cb { + void (*dma_completion_func)(void *cookie); + void *cb_cookie; + u8 *temp_buf; + u8 *temp_buf_to_free; + bool is_cache; + s64 dst_offset; + struct scif_window *dst_window; + size_t len; + dma_addr_t temp_phys; + struct scif_dev *sdev; + int header_padding; +}; + +/** + * struct scif_copy_work - Work for DMA copy + * + * @src_offset: Starting source offset + * @dst_offset: Starting destination offset + * @src_window: Starting src registered window + * @dst_window: Starting dst registered window + * @loopback: true if this is a loopback DMA transfer + * @len: Length of the transfer + * @comp_cb: DMA copy completion callback + * @remote_dev: The remote SCIF peer device + * @fence_type: polling or interrupt based + * @ordered: is this a tail byte ordered DMA transfer + */ +struct scif_copy_work { + s64 src_offset; + s64 dst_offset; + struct scif_window *src_window; + struct scif_window *dst_window; + int loopback; + size_t len; + struct scif_dma_comp_cb *comp_cb; + struct scif_dev *remote_dev; + int fence_type; + bool ordered; +}; + +#ifndef list_entry_next +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) +#endif + +/** + * scif_reserve_dma_chan: + * @ep: Endpoint Descriptor. + * + * This routine reserves a DMA channel for a particular + * endpoint. All DMA transfers for an endpoint are always + * programmed on the same DMA channel. + */ +int scif_reserve_dma_chan(struct scif_endpt *ep) +{ + int err = 0; + struct scif_dev *scifdev; + struct scif_hw_dev *sdev; + struct dma_chan *chan; + + /* Loopback DMAs are not supported on the management node */ + if (!scif_info.nodeid && scifdev_self(ep->remote_dev)) + return 0; + if (scif_info.nodeid) + scifdev = &scif_dev[0]; + else + scifdev = ep->remote_dev; + sdev = scifdev->sdev; + if (!sdev->num_dma_ch) + return -ENODEV; + chan = sdev->dma_ch[scifdev->dma_ch_idx]; + scifdev->dma_ch_idx = (scifdev->dma_ch_idx + 1) % sdev->num_dma_ch; + mutex_lock(&ep->rma_info.rma_lock); + ep->rma_info.dma_chan = chan; + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +#ifdef CONFIG_MMU_NOTIFIER +/** + * scif_rma_destroy_tcw: + * + * This routine destroys temporary cached windows + */ +static +void __scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, + struct scif_endpt *ep, + u64 start, u64 len) +{ + struct list_head *item, *tmp; + struct scif_window *window; + u64 start_va, end_va; + u64 end = start + len; + + if (end <= start) + return; + + list_for_each_safe(item, tmp, &mmn->tc_reg_list) { + window = list_entry(item, struct scif_window, list); + ep = (struct scif_endpt *)window->ep; + if (!len) + break; + start_va = window->va_for_temp; + end_va = start_va + (window->nr_pages << PAGE_SHIFT); + if (start < start_va && end <= start_va) + break; + if (start >= end_va) + continue; + __scif_rma_destroy_tcw_helper(window); + } +} + +static void scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, u64 start, u64 len) +{ + struct scif_endpt *ep = mmn->ep; + + spin_lock(&ep->rma_info.tc_lock); + __scif_rma_destroy_tcw(mmn, ep, start, len); + spin_unlock(&ep->rma_info.tc_lock); +} + +static void scif_rma_destroy_tcw_ep(struct scif_endpt *ep) +{ + struct list_head *item, *tmp; + struct scif_mmu_notif *mmn; + + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); + } +} + +static void __scif_rma_destroy_tcw_ep(struct scif_endpt *ep) +{ + struct list_head *item, *tmp; + struct scif_mmu_notif *mmn; + + spin_lock(&ep->rma_info.tc_lock); + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + __scif_rma_destroy_tcw(mmn, ep, 0, ULONG_MAX); + } + spin_unlock(&ep->rma_info.tc_lock); +} + +static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) +{ + if ((cur_bytes >> PAGE_SHIFT) > scif_info.rma_tc_limit) + return false; + if ((atomic_read(&ep->rma_info.tcw_total_pages) + + (cur_bytes >> PAGE_SHIFT)) > + scif_info.rma_tc_limit) { + dev_info(scif_info.mdev.this_device, + "%s %d total=%d, current=%zu reached max\n", + __func__, __LINE__, + atomic_read(&ep->rma_info.tcw_total_pages), + (1 + (cur_bytes >> PAGE_SHIFT))); + scif_rma_destroy_tcw_invalid(); + __scif_rma_destroy_tcw_ep(ep); + } + return true; +} + +static void scif_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); + schedule_work(&scif_info.misc_work); +} + +static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, address, PAGE_SIZE); +} + +static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, start, end - start); +} + +static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + /* + * Nothing to do here, everything needed was done in + * invalidate_range_start. + */ +} + +static const struct mmu_notifier_ops scif_mmu_notifier_ops = { + .release = scif_mmu_notifier_release, + .clear_flush_young = NULL, + .invalidate_page = scif_mmu_notifier_invalidate_page, + .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, + .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; + +static void scif_ep_unregister_mmu_notifier(struct scif_endpt *ep) +{ + struct scif_endpt_rma_info *rma = &ep->rma_info; + struct scif_mmu_notif *mmn = NULL; + struct list_head *item, *tmp; + + mutex_lock(&ep->rma_info.mmn_lock); + list_for_each_safe(item, tmp, &rma->mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm); + list_del(item); + kfree(mmn); + } + mutex_unlock(&ep->rma_info.mmn_lock); +} + +static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn, + struct mm_struct *mm, struct scif_endpt *ep) +{ + mmn->ep = ep; + mmn->mm = mm; + mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops; + INIT_LIST_HEAD(&mmn->list); + INIT_LIST_HEAD(&mmn->tc_reg_list); +} + +static struct scif_mmu_notif * +scif_find_mmu_notifier(struct mm_struct *mm, struct scif_endpt_rma_info *rma) +{ + struct scif_mmu_notif *mmn; + struct list_head *item; + + list_for_each(item, &rma->mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + if (mmn->mm == mm) + return mmn; + } + return NULL; +} + +static struct scif_mmu_notif * +scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) +{ + struct scif_mmu_notif *mmn + = kzalloc(sizeof(*mmn), GFP_KERNEL); + + if (!mmn) + return ERR_PTR(ENOMEM); + + scif_init_mmu_notifier(mmn, current->mm, ep); + if (mmu_notifier_register(&mmn->ep_mmu_notifier, + current->mm)) { + kfree(mmn); + return ERR_PTR(EBUSY); + } + list_add(&mmn->list, &ep->rma_info.mmn_list); + return mmn; +} + +/* + * Called from the misc thread to destroy temporary cached windows and + * unregister the MMU notifier for the SCIF endpoint. + */ +void scif_mmu_notif_handler(struct work_struct *work) +{ + struct list_head *pos, *tmpq; + struct scif_endpt *ep; +restart: + scif_rma_destroy_tcw_invalid(); + spin_lock(&scif_info.rmalock); + list_for_each_safe(pos, tmpq, &scif_info.mmu_notif_cleanup) { + ep = list_entry(pos, struct scif_endpt, mmu_list); + list_del(&ep->mmu_list); + spin_unlock(&scif_info.rmalock); + scif_rma_destroy_tcw_ep(ep); + scif_ep_unregister_mmu_notifier(ep); + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +static bool scif_is_set_reg_cache(int flags) +{ + return !!(flags & SCIF_RMA_USECACHE); +} +#else +static struct scif_mmu_notif * +scif_find_mmu_notifier(struct mm_struct *mm, + struct scif_endpt_rma_info *rma) +{ + return NULL; +} + +static struct scif_mmu_notif * +scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) +{ + return NULL; +} + +void scif_mmu_notif_handler(struct work_struct *work) +{ +} + +static bool scif_is_set_reg_cache(int flags) +{ + return false; +} + +static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) +{ + return false; +} +#endif + +/** + * scif_register_temp: + * @epd: End Point Descriptor. + * @addr: virtual address to/from which to copy + * @len: length of range to copy + * @out_offset: computed offset returned by reference. + * @out_window: allocated registered window returned by reference. + * + * Create a temporary registered window. The peer will not know about this + * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's. + */ +static int +scif_register_temp(scif_epd_t epd, unsigned long addr, size_t len, int prot, + off_t *out_offset, struct scif_window **out_window) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err; + scif_pinned_pages_t pinned_pages; + size_t aligned_len; + + aligned_len = ALIGN(len, PAGE_SIZE); + + err = __scif_pin_pages((void *)(addr & PAGE_MASK), + aligned_len, &prot, 0, &pinned_pages); + if (err) + return err; + + pinned_pages->prot = prot; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, 0, 0, + aligned_len >> PAGE_SHIFT, + (s64 *)out_offset); + if (err) + goto error_unpin; + + /* Allocate and prepare self registration window */ + *out_window = scif_create_window(ep, aligned_len >> PAGE_SHIFT, + *out_offset, true); + if (!*out_window) { + scif_free_window_offset(ep, NULL, *out_offset); + err = -ENOMEM; + goto error_unpin; + } + + (*out_window)->pinned_pages = pinned_pages; + (*out_window)->nr_pages = pinned_pages->nr_pages; + (*out_window)->prot = pinned_pages->prot; + + (*out_window)->va_for_temp = addr & PAGE_MASK; + err = scif_map_window(ep->remote_dev, *out_window); + if (err) { + /* Something went wrong! Rollback */ + scif_destroy_window(ep, *out_window); + *out_window = NULL; + } else { + *out_offset |= (addr - (*out_window)->va_for_temp); + } + return err; +error_unpin: + if (err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + scif_unpin_pages(pinned_pages); + return err; +} + +#define SCIF_DMA_TO (3 * HZ) + +/* + * scif_sync_dma - Program a DMA without an interrupt descriptor + * + * @dev - The address of the pointer to the device instance used + * for DMA registration. + * @chan - DMA channel to be used. + * @sync_wait: Wait for DMA to complete? + * + * Return 0 on success and -errno on error. + */ +static int scif_sync_dma(struct scif_hw_dev *sdev, struct dma_chan *chan, + bool sync_wait) +{ + int err = 0; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags flags = DMA_PREP_FENCE; + dma_cookie_t cookie; + struct dma_device *ddev; + + if (!chan) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + ddev = chan->device; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); + if (!tx) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + if (!sync_wait) { + dma_async_issue_pending(chan); + } else { + if (dma_sync_wait(chan, cookie) == DMA_COMPLETE) { + err = 0; + } else { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + } + } +release: + return err; +} + +static void scif_dma_callback(void *arg) +{ + struct completion *done = (struct completion *)arg; + + complete(done); +} + +#define SCIF_DMA_SYNC_WAIT true +#define SCIF_DMA_POLL BIT(0) +#define SCIF_DMA_INTR BIT(1) + +/* + * scif_async_dma - Program a DMA with an interrupt descriptor + * + * @dev - The address of the pointer to the device instance used + * for DMA registration. + * @chan - DMA channel to be used. + * Return 0 on success and -errno on error. + */ +static int scif_async_dma(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + int err = 0; + struct dma_device *ddev; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags flags = DMA_PREP_INTERRUPT | DMA_PREP_FENCE; + DECLARE_COMPLETION_ONSTACK(done_wait); + dma_cookie_t cookie; + enum dma_status status; + + if (!chan) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + ddev = chan->device; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); + if (!tx) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + reinit_completion(&done_wait); + tx->callback = scif_dma_callback; + tx->callback_param = &done_wait; + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + dma_async_issue_pending(chan); + + err = wait_for_completion_timeout(&done_wait, SCIF_DMA_TO); + if (!err) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + err = 0; + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); + if (status != DMA_COMPLETE) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } +release: + return err; +} + +/* + * scif_drain_dma_poll - Drain all outstanding DMA operations for a particular + * DMA channel via polling. + * + * @sdev - The SCIF device + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +static int scif_drain_dma_poll(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + if (!chan) + return -EINVAL; + return scif_sync_dma(sdev, chan, SCIF_DMA_SYNC_WAIT); +} + +/* + * scif_drain_dma_intr - Drain all outstanding DMA operations for a particular + * DMA channel via interrupt based blocking wait. + * + * @sdev - The SCIF device + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + if (!chan) + return -EINVAL; + return scif_async_dma(sdev, chan); +} + +/** + * scif_rma_destroy_windows: + * + * This routine destroys all windows queued for cleanup + */ +void scif_rma_destroy_windows(void) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep; + struct dma_chan *chan; + + might_sleep(); +restart: + spin_lock(&scif_info.rmalock); + list_for_each_safe(item, tmp, &scif_info.rma) { + window = list_entry(item, struct scif_window, + list); + ep = (struct scif_endpt *)window->ep; + chan = ep->rma_info.dma_chan; + + list_del_init(&window->list); + spin_unlock(&scif_info.rmalock); + if (!chan || !scifdev_alive(ep) || + !scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan)) + /* Remove window from global list */ + window->unreg_state = OP_COMPLETED; + else + dev_warn(&ep->remote_dev->sdev->dev, + "DMA engine hung?\n"); + if (window->unreg_state == OP_COMPLETED) { + if (window->type == SCIF_WINDOW_SELF) + scif_destroy_window(ep, window); + else + scif_destroy_remote_window(window); + atomic_dec(&ep->rma_info.tw_refcount); + } + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +/** + * scif_rma_destroy_tcw: + * + * This routine destroys temporary cached registered windows + * which have been queued for cleanup. + */ +void scif_rma_destroy_tcw_invalid(void) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep; + struct dma_chan *chan; + + might_sleep(); +restart: + spin_lock(&scif_info.rmalock); + list_for_each_safe(item, tmp, &scif_info.rma_tc) { + window = list_entry(item, struct scif_window, list); + ep = (struct scif_endpt *)window->ep; + chan = ep->rma_info.dma_chan; + list_del_init(&window->list); + spin_unlock(&scif_info.rmalock); + mutex_lock(&ep->rma_info.rma_lock); + if (!chan || !scifdev_alive(ep) || + !scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan)) { + atomic_sub(window->nr_pages, + &ep->rma_info.tcw_total_pages); + scif_destroy_window(ep, window); + atomic_dec(&ep->rma_info.tcw_refcount); + } else { + dev_warn(&ep->remote_dev->sdev->dev, + "DMA engine hung?\n"); + } + mutex_unlock(&ep->rma_info.rma_lock); + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +static inline +void *_get_local_va(off_t off, struct scif_window *window, size_t len) +{ + int page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + void *va = NULL; + + if (window->type == SCIF_WINDOW_SELF) { + struct page **pages = window->pinned_pages->pages; + + va = page_address(pages[page_nr]) + page_off; + } + return va; +} + +static inline +void *ioremap_remote(off_t off, struct scif_window *window, + size_t len, struct scif_dev *dev, + struct scif_window_iter *iter) +{ + dma_addr_t phys = scif_off_to_dma_addr(window, off, NULL, iter); + + /* + * If the DMA address is not card relative then we need the DMA + * addresses to be an offset into the bar. The aperture base was already + * added so subtract it here since scif_ioremap is going to add it again + */ + if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && + dev->sdev->aper && !dev->sdev->card_rel_da) + phys = phys - dev->sdev->aper->pa; + return scif_ioremap(phys, len, dev); +} + +static inline void +iounmap_remote(void *virt, size_t size, struct scif_copy_work *work) +{ + scif_iounmap(virt, size, work->remote_dev); +} + +/* + * Takes care of ordering issue caused by + * 1. Hardware: Only in the case of cpu copy from mgmt node to card + * because of WC memory. + * 2. Software: If memcpy reorders copy instructions for optimization. + * This could happen at both mgmt node and card. + */ +static inline void +scif_ordered_memcpy_toio(char *dst, const char *src, size_t count) +{ + if (!count) + return; + + memcpy_toio((void __iomem __force *)dst, src, --count); + /* Order the last byte with the previous stores */ + wmb(); + *(dst + count) = *(src + count); +} + +static inline void scif_unaligned_cpy_toio(char *dst, const char *src, + size_t count, bool ordered) +{ + if (ordered) + scif_ordered_memcpy_toio(dst, src, count); + else + memcpy_toio((void __iomem __force *)dst, src, count); +} + +static inline +void scif_ordered_memcpy_fromio(char *dst, const char *src, size_t count) +{ + if (!count) + return; + + memcpy_fromio(dst, (void __iomem __force *)src, --count); + /* Order the last byte with the previous loads */ + rmb(); + *(dst + count) = *(src + count); +} + +static inline void scif_unaligned_cpy_fromio(char *dst, const char *src, + size_t count, bool ordered) +{ + if (ordered) + scif_ordered_memcpy_fromio(dst, src, count); + else + memcpy_fromio(dst, (void __iomem __force *)src, count); +} + +#define SCIF_RMA_ERROR_CODE (~(dma_addr_t)0x0) + +/* + * scif_off_to_dma_addr: + * Obtain the dma_addr given the window and the offset. + * @window: Registered window. + * @off: Window offset. + * @nr_bytes: Return the number of contiguous bytes till next DMA addr index. + * @index: Return the index of the dma_addr array found. + * @start_off: start offset of index of the dma addr array found. + * The nr_bytes provides the callee an estimate of the maximum possible + * DMA xfer possible while the index/start_off provide faster lookups + * for the next iteration. + */ +dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off, + size_t *nr_bytes, struct scif_window_iter *iter) +{ + int i, page_nr; + s64 start, end; + off_t page_off; + + if (window->nr_pages == window->nr_contig_chunks) { + page_nr = (off - window->offset) >> PAGE_SHIFT; + page_off = off & ~PAGE_MASK; + + if (nr_bytes) + *nr_bytes = PAGE_SIZE - page_off; + return window->dma_addr[page_nr] | page_off; + } + if (iter) { + i = iter->index; + start = iter->offset; + } else { + i = 0; + start = window->offset; + } + for (; i < window->nr_contig_chunks; i++) { + end = start + (window->num_pages[i] << PAGE_SHIFT); + if (off >= start && off < end) { + if (iter) { + iter->index = i; + iter->offset = start; + } + if (nr_bytes) + *nr_bytes = end - off; + return (window->dma_addr[i] + (off - start)); + } + start += (window->num_pages[i] << PAGE_SHIFT); + } + dev_err(scif_info.mdev.this_device, + "%s %d BUG. Addr not found? window %p off 0x%llx\n", + __func__, __LINE__, window, off); + return SCIF_RMA_ERROR_CODE; +} + +/* + * Copy between rma window and temporary buffer + */ +static void scif_rma_local_cpu_copy(s64 offset, struct scif_window *window, + u8 *temp, size_t rem_len, bool to_temp) +{ + void *window_virt; + size_t loop_len; + int offset_in_page; + s64 end_offset; + + offset_in_page = offset & ~PAGE_MASK; + loop_len = PAGE_SIZE - offset_in_page; + + if (rem_len < loop_len) + loop_len = rem_len; + + window_virt = _get_local_va(offset, window, loop_len); + if (!window_virt) + return; + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + + offset += loop_len; + temp += loop_len; + rem_len -= loop_len; + + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + while (rem_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + loop_len = min(PAGE_SIZE, rem_len); + window_virt = _get_local_va(offset, window, loop_len); + if (!window_virt) + return; + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + offset += loop_len; + temp += loop_len; + rem_len -= loop_len; + } +} + +/** + * scif_rma_completion_cb: + * @data: RMA cookie + * + * RMA interrupt completion callback. + */ +static void scif_rma_completion_cb(void *data) +{ + struct scif_dma_comp_cb *comp_cb = data; + + /* Free DMA Completion CB. */ + if (comp_cb->dst_window) + scif_rma_local_cpu_copy(comp_cb->dst_offset, + comp_cb->dst_window, + comp_cb->temp_buf + + comp_cb->header_padding, + comp_cb->len, false); + scif_unmap_single(comp_cb->temp_phys, comp_cb->sdev, + SCIF_KMEM_UNALIGNED_BUF_SIZE); + if (comp_cb->is_cache) + kmem_cache_free(unaligned_cache, + comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); +} + +/* Copies between temporary buffer and offsets provided in work */ +static int +scif_rma_list_dma_copy_unaligned(struct scif_copy_work *work, + u8 *temp, struct dma_chan *chan, + bool src_local) +{ + struct scif_dma_comp_cb *comp_cb = work->comp_cb; + dma_addr_t window_dma_addr, temp_dma_addr; + dma_addr_t temp_phys = comp_cb->temp_phys; + size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len; + int offset_in_ca, ret = 0; + s64 end_offset, offset; + struct scif_window *window; + void *window_virt_addr; + size_t tail_len; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + if (src_local) { + offset = work->dst_offset; + window = work->dst_window; + } else { + offset = work->src_offset; + window = work->src_window; + } + + offset_in_ca = offset & (L1_CACHE_BYTES - 1); + if (offset_in_ca) { + loop_len = L1_CACHE_BYTES - offset_in_ca; + loop_len = min(loop_len, remaining_len); + window_virt_addr = ioremap_remote(offset, window, + loop_len, + work->remote_dev, + NULL); + if (!window_virt_addr) + return -ENOMEM; + if (src_local) + scif_unaligned_cpy_toio(window_virt_addr, temp, + loop_len, + work->ordered && + !(remaining_len - loop_len)); + else + scif_unaligned_cpy_fromio(temp, window_virt_addr, + loop_len, work->ordered && + !(remaining_len - loop_len)); + iounmap_remote(window_virt_addr, loop_len, work); + + offset += loop_len; + temp += loop_len; + temp_phys += loop_len; + remaining_len -= loop_len; + } + + offset_in_ca = offset & ~PAGE_MASK; + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + + tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + if (scif_is_mgmt_node()) + temp_dma_addr = temp_phys; + else + /* Fix if we ever enable IOMMU on the card */ + temp_dma_addr = (dma_addr_t)virt_to_phys(temp); + window_dma_addr = scif_off_to_dma_addr(window, offset, + &nr_contig_bytes, + NULL); + loop_len = min(nr_contig_bytes, remaining_len); + if (src_local) { + if (work->ordered && !tail_len && + !(remaining_len - loop_len) && + loop_len != L1_CACHE_BYTES) { + /* + * Break up the last chunk of the transfer into + * two steps. if there is no tail to guarantee + * DMA ordering. SCIF_DMA_POLLING inserts + * a status update descriptor in step 1 which + * acts as a double sided synchronization fence + * for the DMA engine to ensure that the last + * cache line in step 2 is updated last. + */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */ + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len - + L1_CACHE_BYTES, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + offset += (loop_len - L1_CACHE_BYTES); + temp_dma_addr += (loop_len - L1_CACHE_BYTES); + window_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + } else { + tx = dev->device_prep_dma_memcpy(chan, temp_dma_addr, + window_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + if (ret < 0) + goto err; + offset += loop_len; + temp += loop_len; + temp_phys += loop_len; + remaining_len -= loop_len; + offset_in_ca = 0; + } + if (tail_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + window_virt_addr = ioremap_remote(offset, window, tail_len, + work->remote_dev, + NULL); + if (!window_virt_addr) + return -ENOMEM; + /* + * The CPU copy for the tail bytes must be initiated only once + * previous DMA transfers for this endpoint have completed + * to guarantee ordering. + */ + if (work->ordered) { + struct scif_dev *rdev = work->remote_dev; + + ret = scif_drain_dma_intr(rdev->sdev, chan); + if (ret) + return ret; + } + if (src_local) + scif_unaligned_cpy_toio(window_virt_addr, temp, + tail_len, work->ordered); + else + scif_unaligned_cpy_fromio(temp, window_virt_addr, + tail_len, work->ordered); + iounmap_remote(window_virt_addr, tail_len, work); + } + tx = dev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_INTERRUPT); + if (!tx) { + ret = -ENOMEM; + return ret; + } + tx->callback = &scif_rma_completion_cb; + tx->callback_param = comp_cb; + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + return ret; + } + dma_async_issue_pending(chan); + return 0; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * _scif_rma_list_dma_copy_aligned: + * + * Traverse all the windows and perform DMA copy. + */ +static int _scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, + struct dma_chan *chan) +{ + dma_addr_t src_dma_addr, dst_dma_addr; + size_t loop_len, remaining_len, src_contig_bytes = 0; + size_t dst_contig_bytes = 0; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + s64 end_src_offset, end_dst_offset; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + int ret = 0; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + remaining_len = work->len; + + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + while (remaining_len) { + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, list); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(src_window, &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(dst_window, &dst_win_iter); + } + + /* compute dma addresses for transfer */ + src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, + &src_contig_bytes, + &src_win_iter); + dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, + &dst_contig_bytes, + &dst_win_iter); + loop_len = min(src_contig_bytes, dst_contig_bytes); + loop_len = min(loop_len, remaining_len); + if (work->ordered && !(remaining_len - loop_len)) { + /* + * Break up the last chunk of the transfer into two + * steps to ensure that the last byte in step 2 is + * updated last. + */ + /* Step 1) DMA: Body Length - 1 */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len - 1, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + src_offset += (loop_len - 1); + dst_offset += (loop_len - 1); + src_dma_addr += (loop_len - 1); + dst_dma_addr += (loop_len - 1); + remaining_len -= (loop_len - 1); + loop_len = remaining_len; + + /* Step 2) DMA: 1 BYTES */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + } + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + return ret; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * scif_rma_list_dma_copy_aligned: + * + * Traverse all the windows and perform DMA copy. + */ +static int scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, + struct dma_chan *chan) +{ + dma_addr_t src_dma_addr, dst_dma_addr; + size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0; + size_t dst_contig_bytes = 0; + int src_cache_off; + s64 end_src_offset, end_dst_offset; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + void *src_virt, *dst_virt; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + int ret = 0; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + remaining_len = work->len; + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + if (src_cache_off != 0) { + /* Head */ + loop_len = L1_CACHE_BYTES - src_cache_off; + loop_len = min(loop_len, remaining_len); + src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, NULL); + if (!src_virt) + return -ENOMEM; + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, NULL); + if (!dst_virt) { + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + if (src_window->type == SCIF_WINDOW_SELF) + scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, + remaining_len == loop_len ? + work->ordered : false); + else + scif_unaligned_cpy_fromio(dst_virt, src_virt, loop_len, + remaining_len == loop_len ? + work->ordered : false); + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + if (dst_window->type != SCIF_WINDOW_SELF) + iounmap_remote(dst_virt, loop_len, work); + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, list); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(src_window, &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(dst_window, &dst_win_iter); + } + + /* compute dma addresses for transfer */ + src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, + &src_contig_bytes, + &src_win_iter); + dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, + &dst_contig_bytes, + &dst_win_iter); + loop_len = min(src_contig_bytes, dst_contig_bytes); + loop_len = min(loop_len, remaining_len); + if (work->ordered && !tail_len && + !(remaining_len - loop_len)) { + /* + * Break up the last chunk of the transfer into two + * steps. if there is no tail to gurantee DMA ordering. + * Passing SCIF_DMA_POLLING inserts a status update + * descriptor in step 1 which acts as a double sided + * synchronization fence for the DMA engine to ensure + * that the last cache line in step 2 is updated last. + */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len - + L1_CACHE_BYTES, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + src_offset += (loop_len - L1_CACHE_BYTES); + dst_offset += (loop_len - L1_CACHE_BYTES); + src_dma_addr += (loop_len - L1_CACHE_BYTES); + dst_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + remaining_len = tail_len; + if (remaining_len) { + loop_len = remaining_len; + if (src_offset == end_src_offset) + src_window = list_entry_next(src_window, list); + if (dst_offset == end_dst_offset) + dst_window = list_entry_next(dst_window, list); + + src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); + /* + * The CPU copy for the tail bytes must be initiated only once + * previous DMA transfers for this endpoint have completed to + * guarantee ordering. + */ + if (work->ordered) { + struct scif_dev *rdev = work->remote_dev; + + ret = scif_drain_dma_poll(rdev->sdev, chan); + if (ret) + return ret; + } + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, NULL); + if (!src_virt) + return -ENOMEM; + + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, NULL); + if (!dst_virt) { + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + + if (src_window->type == SCIF_WINDOW_SELF) + scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, + work->ordered); + else + scif_unaligned_cpy_fromio(dst_virt, src_virt, + loop_len, work->ordered); + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + + if (dst_window->type != SCIF_WINDOW_SELF) + iounmap_remote(dst_virt, loop_len, work); + remaining_len -= loop_len; + } + return ret; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * scif_rma_list_cpu_copy: + * + * Traverse all the windows and perform CPU copy. + */ +static int scif_rma_list_cpu_copy(struct scif_copy_work *work) +{ + void *src_virt, *dst_virt; + size_t loop_len, remaining_len; + int src_page_off, dst_page_off; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 end_src_offset, end_dst_offset; + int ret = 0; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + + remaining_len = work->len; + + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + while (remaining_len) { + src_page_off = src_offset & ~PAGE_MASK; + dst_page_off = dst_offset & ~PAGE_MASK; + loop_len = min(PAGE_SIZE - + max(src_page_off, dst_page_off), + remaining_len); + + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, + &src_win_iter); + if (!src_virt) { + ret = -ENOMEM; + goto error; + } + + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, + &dst_win_iter); + if (!dst_virt) { + if (src_window->type == SCIF_WINDOW_PEER) + iounmap_remote(src_virt, loop_len, work); + ret = -ENOMEM; + goto error; + } + + if (work->loopback) { + memcpy(dst_virt, src_virt, loop_len); + } else { + if (src_window->type == SCIF_WINDOW_SELF) + memcpy_toio((void __iomem __force *)dst_virt, + src_virt, loop_len); + else + memcpy_fromio(dst_virt, + (void __iomem __force *)src_virt, + loop_len); + } + if (src_window->type == SCIF_WINDOW_PEER) + iounmap_remote(src_virt, loop_len, work); + + if (dst_window->type == SCIF_WINDOW_PEER) + iounmap_remote(dst_virt, loop_len, work); + + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + if (remaining_len) { + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, list); + scif_init_window_iter(src_window, + &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + scif_init_window_iter(dst_window, + &dst_win_iter); + } + } + } +error: + return ret; +} + +static int scif_rma_list_dma_copy_wrapper(struct scif_endpt *epd, + struct scif_copy_work *work, + struct dma_chan *chan, off_t loffset) +{ + int src_cache_off, dst_cache_off; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + u8 *temp = NULL; + bool src_local = true, dst_local = false; + struct scif_dma_comp_cb *comp_cb; + dma_addr_t src_dma_addr, dst_dma_addr; + int err; + + if (is_dma_copy_aligned(chan->device, 1, 1, 1)) + return _scif_rma_list_dma_copy_aligned(work, chan); + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1); + + if (dst_cache_off == src_cache_off) + return scif_rma_list_dma_copy_aligned(work, chan); + + if (work->loopback) + return scif_rma_list_cpu_copy(work); + src_dma_addr = __scif_off_to_dma_addr(work->src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(work->dst_window, dst_offset); + src_local = work->src_window->type == SCIF_WINDOW_SELF; + dst_local = work->dst_window->type == SCIF_WINDOW_SELF; + + dst_local = dst_local; + /* Allocate dma_completion cb */ + comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL); + if (!comp_cb) + goto error; + + work->comp_cb = comp_cb; + comp_cb->cb_cookie = comp_cb; + comp_cb->dma_completion_func = &scif_rma_completion_cb; + + if (work->len + (L1_CACHE_BYTES << 1) < SCIF_KMEM_UNALIGNED_BUF_SIZE) { + comp_cb->is_cache = false; + /* Allocate padding bytes to align to a cache line */ + temp = kmalloc(work->len + (L1_CACHE_BYTES << 1), + GFP_KERNEL); + if (!temp) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + /* kmalloc(..) does not guarantee cache line alignment */ + if (!IS_ALIGNED((u64)temp, L1_CACHE_BYTES)) + temp = PTR_ALIGN(temp, L1_CACHE_BYTES); + } else { + comp_cb->is_cache = true; + temp = kmem_cache_alloc(unaligned_cache, GFP_KERNEL); + if (!temp) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + } + + if (src_local) { + temp += dst_cache_off; + scif_rma_local_cpu_copy(work->src_offset, work->src_window, + temp, work->len, true); + } else { + comp_cb->dst_window = work->dst_window; + comp_cb->dst_offset = work->dst_offset; + work->src_offset = work->src_offset - src_cache_off; + comp_cb->len = work->len; + work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES); + comp_cb->header_padding = src_cache_off; + } + comp_cb->temp_buf = temp; + + err = scif_map_single(&comp_cb->temp_phys, temp, + work->remote_dev, SCIF_KMEM_UNALIGNED_BUF_SIZE); + if (err) + goto free_temp_buf; + comp_cb->sdev = work->remote_dev; + if (scif_rma_list_dma_copy_unaligned(work, temp, chan, src_local) < 0) + goto free_temp_buf; + if (!src_local) + work->fence_type = SCIF_DMA_INTR; + return 0; +free_temp_buf: + if (comp_cb->is_cache) + kmem_cache_free(unaligned_cache, comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); +free_comp_cb: + kfree(comp_cb); +error: + return -ENOMEM; +} + +/** + * scif_rma_copy: + * @epd: end point descriptor. + * @loffset: offset in local registered address space to/from which to copy + * @addr: user virtual address to/from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to/from which to copy + * @flags: flags + * @dir: LOCAL->REMOTE or vice versa. + * @last_chunk: true if this is the last chunk of a larger transfer + * + * Validate parameters, check if src/dst registered ranges requested for copy + * are valid and initiate either CPU or DMA copy. + */ +static int scif_rma_copy(scif_epd_t epd, off_t loffset, unsigned long addr, + size_t len, off_t roffset, int flags, + enum scif_rma_dir dir, bool last_chunk) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_rma_req remote_req; + struct scif_rma_req req; + struct scif_window *local_window = NULL; + struct scif_window *remote_window = NULL; + struct scif_copy_work copy_work; + bool loopback; + int err = 0; + struct dma_chan *chan; + struct scif_mmu_notif *mmn = NULL; + bool cache = false; + struct device *spdev; + + err = scif_verify_epd(ep); + if (err) + return err; + + if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | + SCIF_RMA_SYNC | SCIF_RMA_ORDERED))) + return -EINVAL; + + loopback = scifdev_self(ep->remote_dev) ? true : false; + copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? + SCIF_DMA_POLL : 0; + copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk); + + /* Use CPU for Mgmt node <-> Mgmt node copies */ + if (loopback && scif_is_mgmt_node()) { + flags |= SCIF_RMA_USECPU; + copy_work.fence_type = 0x0; + } + + cache = scif_is_set_reg_cache(flags); + + remote_req.out_window = &remote_window; + remote_req.offset = roffset; + remote_req.nr_bytes = len; + /* + * If transfer is from local to remote then the remote window + * must be writeable and vice versa. + */ + remote_req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_WRITE : VM_READ; + remote_req.type = SCIF_WINDOW_PARTIAL; + remote_req.head = &ep->rma_info.remote_reg_list; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + + if (addr && cache) { + mutex_lock(&ep->rma_info.mmn_lock); + mmn = scif_find_mmu_notifier(current->mm, &ep->rma_info); + if (!mmn) + scif_add_mmu_notifier(current->mm, ep); + mutex_unlock(&ep->rma_info.mmn_lock); + if (IS_ERR(mmn)) { + scif_put_peer_dev(spdev); + return PTR_ERR(mmn); + } + cache = cache && !scif_rma_tc_can_cache(ep, len); + } + mutex_lock(&ep->rma_info.rma_lock); + if (addr) { + req.out_window = &local_window; + req.nr_bytes = ALIGN(len + (addr & ~PAGE_MASK), + PAGE_SIZE); + req.va_for_temp = addr & PAGE_MASK; + req.prot = (dir == SCIF_LOCAL_TO_REMOTE ? + VM_READ : VM_WRITE | VM_READ); + /* Does a valid local window exist? */ + if (mmn) { + spin_lock(&ep->rma_info.tc_lock); + req.head = &mmn->tc_reg_list; + err = scif_query_tcw(ep, &req); + spin_unlock(&ep->rma_info.tc_lock); + } + if (!mmn || err) { + err = scif_register_temp(epd, req.va_for_temp, + req.nr_bytes, req.prot, + &loffset, &local_window); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + if (!cache) + goto skip_cache; + atomic_inc(&ep->rma_info.tcw_refcount); + atomic_add_return(local_window->nr_pages, + &ep->rma_info.tcw_total_pages); + if (mmn) { + spin_lock(&ep->rma_info.tc_lock); + scif_insert_tcw(local_window, + &mmn->tc_reg_list); + spin_unlock(&ep->rma_info.tc_lock); + } + } +skip_cache: + loffset = local_window->offset + + (addr - local_window->va_for_temp); + } else { + req.out_window = &local_window; + req.offset = loffset; + /* + * If transfer is from local to remote then the self window + * must be readable and vice versa. + */ + req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_READ : VM_WRITE; + req.nr_bytes = len; + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.reg_list; + /* Does a valid local window exist? */ + err = scif_query_window(&req); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + } + + /* Does a valid remote window exist? */ + err = scif_query_window(&remote_req); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + + /* + * Prepare copy_work for submitting work to the DMA kernel thread + * or CPU copy routine. + */ + copy_work.len = len; + copy_work.loopback = loopback; + copy_work.remote_dev = ep->remote_dev; + if (dir == SCIF_LOCAL_TO_REMOTE) { + copy_work.src_offset = loffset; + copy_work.src_window = local_window; + copy_work.dst_offset = roffset; + copy_work.dst_window = remote_window; + } else { + copy_work.src_offset = roffset; + copy_work.src_window = remote_window; + copy_work.dst_offset = loffset; + copy_work.dst_window = local_window; + } + + if (flags & SCIF_RMA_USECPU) { + scif_rma_list_cpu_copy(©_work); + } else { + chan = ep->rma_info.dma_chan; + err = scif_rma_list_dma_copy_wrapper(epd, ©_work, + chan, loffset); + } + if (addr && !cache) + atomic_inc(&ep->rma_info.tw_refcount); + + mutex_unlock(&ep->rma_info.rma_lock); + + if (last_chunk) { + struct scif_dev *rdev = ep->remote_dev; + + if (copy_work.fence_type == SCIF_DMA_POLL) + err = scif_drain_dma_poll(rdev->sdev, + ep->rma_info.dma_chan); + else if (copy_work.fence_type == SCIF_DMA_INTR) + err = scif_drain_dma_intr(rdev->sdev, + ep->rma_info.dma_chan); + } + + if (addr && !cache) + scif_queue_for_cleanup(local_window, &scif_info.rma); + scif_put_peer_dev(spdev); + return err; +error: + if (err) { + if (addr && local_window && !cache) + scif_destroy_window(ep, local_window); + dev_err(scif_info.mdev.this_device, + "%s %d err %d len 0x%lx\n", + __func__, __LINE__, err, len); + } + scif_put_peer_dev(spdev); + return err; +} + +int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx offset 0x%lx flags 0x%x\n", + epd, loffset, len, roffset, flags); + if (scif_unaligned(loffset, roffset)) { + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, loffset, 0x0, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_REMOTE_TO_LOCAL, false); + if (err) + goto readfrom_err; + loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, loffset, 0x0, len, + roffset, flags, SCIF_REMOTE_TO_LOCAL, true); +readfrom_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_readfrom); + +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, loffset, len, roffset, flags); + if (scif_unaligned(loffset, roffset)) { + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, loffset, 0x0, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_LOCAL_TO_REMOTE, false); + if (err) + goto writeto_err; + loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, loffset, 0x0, len, + roffset, flags, SCIF_LOCAL_TO_REMOTE, true); +writeto_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_writeto); + +int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vreadfrom: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + if (scif_unaligned((off_t __force)addr, roffset)) { + if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, 0, (u64)addr, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_REMOTE_TO_LOCAL, false); + if (err) + goto vreadfrom_err; + addr += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, 0, (u64)addr, len, + roffset, flags, SCIF_REMOTE_TO_LOCAL, true); +vreadfrom_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_vreadfrom); + +int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vwriteto: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + if (scif_unaligned((off_t __force)addr, roffset)) { + if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, 0, (u64)addr, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_LOCAL_TO_REMOTE, false); + if (err) + goto vwriteto_err; + addr += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, 0, (u64)addr, len, + roffset, flags, SCIF_LOCAL_TO_REMOTE, true); +vwriteto_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_vwriteto); diff --git a/drivers/misc/mic/scif/scif_epd.c b/drivers/misc/mic/scif/scif_epd.c index b4bfbb08a..00e5d6d66 100644 --- a/drivers/misc/mic/scif/scif_epd.c +++ b/drivers/misc/mic/scif/scif_epd.c @@ -65,14 +65,14 @@ void scif_teardown_ep(void *endpt) void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held) { if (!eplock_held) - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); spin_lock(&ep->lock); ep->state = SCIFEP_ZOMBIE; spin_unlock(&ep->lock); list_add_tail(&ep->list, &scif_info.zombie); scif_info.nr_zombies++; if (!eplock_held) - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); schedule_work(&scif_info.misc_work); } @@ -81,16 +81,15 @@ static struct scif_endpt *scif_find_listen_ep(u16 port) struct scif_endpt *ep = NULL; struct list_head *pos, *tmpq; - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); list_for_each_safe(pos, tmpq, &scif_info.listen) { ep = list_entry(pos, struct scif_endpt, list); if (ep->port.port == port) { - spin_lock(&ep->lock); - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); return ep; } } - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); return NULL; } @@ -99,14 +98,17 @@ void scif_cleanup_zombie_epd(void) struct list_head *pos, *tmpq; struct scif_endpt *ep; - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); list_for_each_safe(pos, tmpq, &scif_info.zombie) { ep = list_entry(pos, struct scif_endpt, list); - list_del(pos); - scif_info.nr_zombies--; - kfree(ep); + if (scif_rma_ep_can_uninit(ep)) { + list_del(pos); + scif_info.nr_zombies--; + put_iova_domain(&ep->rma_info.iovad); + kfree(ep); + } } - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); } /** @@ -137,6 +139,8 @@ void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg) if (!ep) /* Send reject due to no listening ports */ goto conreq_sendrej_free; + else + spin_lock(&ep->lock); if (ep->backlog <= ep->conreqcnt) { /* Send reject due to too many pending requests */ diff --git a/drivers/misc/mic/scif/scif_epd.h b/drivers/misc/mic/scif/scif_epd.h index 331322a25..1771d7a9b 100644 --- a/drivers/misc/mic/scif/scif_epd.h +++ b/drivers/misc/mic/scif/scif_epd.h @@ -96,7 +96,11 @@ struct scif_endpt_qp_info { * @conn_port: Connection port * @conn_err: Errors during connection * @conn_async_state: Async connection + * @conn_pend_wq: Used by poll while waiting for incoming connections * @conn_list: List of async connection requests + * @rma_info: Information for triggering SCIF RMA and DMA operations + * @mmu_list: link to list of MMU notifier cleanup work + * @anon: anonymous file for use in kernel mode scif poll */ struct scif_endpt { enum scif_epd_state state; @@ -125,7 +129,11 @@ struct scif_endpt { struct scif_port_id conn_port; int conn_err; int conn_async_state; + wait_queue_head_t conn_pend_wq; struct list_head conn_list; + struct scif_endpt_rma_info rma_info; + struct list_head mmu_list; + struct file *anon; }; static inline int scifdev_alive(struct scif_endpt *ep) @@ -133,6 +141,43 @@ static inline int scifdev_alive(struct scif_endpt *ep) return _scifdev_alive(ep->remote_dev); } +/* + * scif_verify_epd: + * ep: SCIF endpoint + * + * Checks several generic error conditions and returns the + * appropriate error. + */ +static inline int scif_verify_epd(struct scif_endpt *ep) +{ + if (ep->state == SCIFEP_DISCONNECTED) + return -ECONNRESET; + + if (ep->state != SCIFEP_CONNECTED) + return -ENOTCONN; + + if (!scifdev_alive(ep)) + return -ENODEV; + + return 0; +} + +static inline int scif_anon_inode_getfile(scif_epd_t epd) +{ + epd->anon = anon_inode_getfile("scif", &scif_anon_fops, NULL, 0); + if (IS_ERR(epd->anon)) + return PTR_ERR(epd->anon); + return 0; +} + +static inline void scif_anon_inode_fput(scif_epd_t epd) +{ + if (epd->anon) { + fput(epd->anon); + epd->anon = NULL; + } +} + void scif_cleanup_zombie_epd(void); void scif_teardown_ep(void *endpt); void scif_cleanup_ep_qp(struct scif_endpt *ep); @@ -157,4 +202,9 @@ void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg); void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg); int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block); int __scif_flush(scif_epd_t epd); +int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd); +unsigned int __scif_pollfd(struct file *f, poll_table *wait, + struct scif_endpt *ep); +int __scif_pin_pages(void *addr, size_t len, int *out_prot, + int map_flags, scif_pinned_pages_t *pages); #endif /* SCIF_EPD_H */ diff --git a/drivers/misc/mic/scif/scif_fd.c b/drivers/misc/mic/scif/scif_fd.c index eccf7e713..f7e826142 100644 --- a/drivers/misc/mic/scif/scif_fd.c +++ b/drivers/misc/mic/scif/scif_fd.c @@ -34,6 +34,20 @@ static int scif_fdclose(struct inode *inode, struct file *f) return scif_close(priv); } +static int scif_fdmmap(struct file *f, struct vm_area_struct *vma) +{ + struct scif_endpt *priv = f->private_data; + + return scif_mmap(vma, priv); +} + +static unsigned int scif_fdpoll(struct file *f, poll_table *wait) +{ + struct scif_endpt *priv = f->private_data; + + return __scif_pollfd(f, wait, priv); +} + static int scif_fdflush(struct file *f, fl_owner_t id) { struct scif_endpt *ep = f->private_data; @@ -140,12 +154,12 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) * Add to the list of user mode eps where the second half * of the accept is not yet completed. */ - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); list_add_tail(&((*ep)->miacceptlist), &scif_info.uaccept); list_add_tail(&((*ep)->liacceptlist), &priv->li_accept); (*ep)->listenep = priv; priv->acceptcnt++; - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); return 0; } @@ -163,7 +177,7 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EFAULT; /* Remove form the user accept queue */ - spin_lock(&scif_info.eplock); + mutex_lock(&scif_info.eplock); list_for_each_safe(pos, tmpq, &scif_info.uaccept) { tmpep = list_entry(pos, struct scif_endpt, miacceptlist); @@ -175,7 +189,7 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) } if (!fep) { - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); return -ENOENT; } @@ -190,9 +204,10 @@ static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) } } - spin_unlock(&scif_info.eplock); + mutex_unlock(&scif_info.eplock); /* Free the resources automatically created from the open. */ + scif_anon_inode_fput(priv); scif_teardown_ep(priv); scif_add_epd_to_zombie_list(priv, !SCIF_EPLOCK_HELD); f->private_data = newep; @@ -290,6 +305,157 @@ getnodes_err1: getnodes_err2: return err; } + case SCIF_REG: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_reg reg; + off_t ret; + + if (copy_from_user(®, argp, sizeof(reg))) { + err = -EFAULT; + goto reg_err; + } + if (reg.flags & SCIF_MAP_KERNEL) { + err = -EINVAL; + goto reg_err; + } + ret = scif_register(priv, (void *)reg.addr, reg.len, + reg.offset, reg.prot, reg.flags); + if (ret < 0) { + err = (int)ret; + goto reg_err; + } + + if (copy_to_user(&((struct scifioctl_reg __user *)argp) + ->out_offset, &ret, sizeof(reg.out_offset))) { + err = -EFAULT; + goto reg_err; + } + err = 0; +reg_err: + scif_err_debug(err, "scif_register"); + return err; + } + case SCIF_UNREG: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_unreg unreg; + + if (copy_from_user(&unreg, argp, sizeof(unreg))) { + err = -EFAULT; + goto unreg_err; + } + err = scif_unregister(priv, unreg.offset, unreg.len); +unreg_err: + scif_err_debug(err, "scif_unregister"); + return err; + } + case SCIF_READFROM: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto readfrom_err; + } + err = scif_readfrom(priv, copy.loffset, copy.len, copy.roffset, + copy.flags); +readfrom_err: + scif_err_debug(err, "scif_readfrom"); + return err; + } + case SCIF_WRITETO: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto writeto_err; + } + err = scif_writeto(priv, copy.loffset, copy.len, copy.roffset, + copy.flags); +writeto_err: + scif_err_debug(err, "scif_writeto"); + return err; + } + case SCIF_VREADFROM: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vreadfrom_err; + } + err = scif_vreadfrom(priv, (void __force *)copy.addr, copy.len, + copy.roffset, copy.flags); +vreadfrom_err: + scif_err_debug(err, "scif_vreadfrom"); + return err; + } + case SCIF_VWRITETO: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vwriteto_err; + } + err = scif_vwriteto(priv, (void __force *)copy.addr, copy.len, + copy.roffset, copy.flags); +vwriteto_err: + scif_err_debug(err, "scif_vwriteto"); + return err; + } + case SCIF_FENCE_MARK: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_fence_mark mark; + int tmp_mark = 0; + + if (copy_from_user(&mark, argp, sizeof(mark))) { + err = -EFAULT; + goto fence_mark_err; + } + err = scif_fence_mark(priv, mark.flags, &tmp_mark); + if (err) + goto fence_mark_err; + if (copy_to_user((void __user *)mark.mark, &tmp_mark, + sizeof(tmp_mark))) { + err = -EFAULT; + goto fence_mark_err; + } +fence_mark_err: + scif_err_debug(err, "scif_fence_mark"); + return err; + } + case SCIF_FENCE_WAIT: + { + struct scif_endpt *priv = f->private_data; + + err = scif_fence_wait(priv, arg); + scif_err_debug(err, "scif_fence_wait"); + return err; + } + case SCIF_FENCE_SIGNAL: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_fence_signal signal; + + if (copy_from_user(&signal, argp, sizeof(signal))) { + err = -EFAULT; + goto fence_signal_err; + } + + err = scif_fence_signal(priv, signal.loff, signal.lval, + signal.roff, signal.rval, signal.flags); +fence_signal_err: + scif_err_debug(err, "scif_fence_signal"); + return err; + } } return -EINVAL; } @@ -298,6 +464,8 @@ const struct file_operations scif_fops = { .open = scif_fdopen, .release = scif_fdclose, .unlocked_ioctl = scif_fdioctl, + .mmap = scif_fdmmap, + .poll = scif_fdpoll, .flush = scif_fdflush, .owner = THIS_MODULE, }; diff --git a/drivers/misc/mic/scif/scif_fence.c b/drivers/misc/mic/scif/scif_fence.c new file mode 100644 index 000000000..7f2c96f57 --- /dev/null +++ b/drivers/misc/mic/scif/scif_fence.c @@ -0,0 +1,771 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ + +#include "scif_main.h" + +/** + * scif_recv_mark: Handle SCIF_MARK request + * @msg: Interrupt message + * + * The peer has requested a mark. + */ +void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int mark, err; + + err = _scif_fence_mark(ep, &mark); + if (err) + msg->uop = SCIF_MARK_NACK; + else + msg->uop = SCIF_MARK_ACK; + msg->payload[0] = ep->remote_ep; + msg->payload[2] = mark; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_MARK message. + */ +void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_MARK_ACK) { + fence_req->state = OP_COMPLETED; + fence_req->dma_mark = (int)msg->payload[2]; + } else { + fence_req->state = OP_FAILED; + } + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +/** + * scif_recv_wait: Handle SCIF_WAIT request + * @msg: Interrupt message + * + * The peer has requested waiting on a fence. + */ +void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_remote_fence_info *fence; + + /* + * Allocate structure for remote fence information and + * send a NACK if the allocation failed. The peer will + * return ENOMEM upon receiving a NACK. + */ + fence = kmalloc(sizeof(*fence), GFP_KERNEL); + if (!fence) { + msg->payload[0] = ep->remote_ep; + msg->uop = SCIF_WAIT_NACK; + scif_nodeqp_send(ep->remote_dev, msg); + return; + } + + /* Prepare the fence request */ + memcpy(&fence->msg, msg, sizeof(struct scifmsg)); + INIT_LIST_HEAD(&fence->list); + + /* Insert to the global remote fence request list */ + mutex_lock(&scif_info.fencelock); + atomic_inc(&ep->rma_info.fence_refcount); + list_add_tail(&fence->list, &scif_info.fence); + mutex_unlock(&scif_info.fencelock); + + schedule_work(&scif_info.misc_work); +} + +/** + * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_WAIT message. + */ +void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_WAIT_ACK) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +/** + * scif_recv_sig_local: Handle SCIF_SIG_LOCAL request + * @msg: Interrupt message + * + * The peer has requested a signal on a local offset. + */ +void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int err; + + err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], + SCIF_WINDOW_SELF); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_sig_remote: Handle SCIF_SIGNAL_REMOTE request + * @msg: Interrupt message + * + * The peer has requested a signal on a remote offset. + */ +void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int err; + + err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], + SCIF_WINDOW_PEER); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_sig_resp: Handle SCIF_SIG_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a signal request. + */ +void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[3]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_SIG_ACK) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +static inline void *scif_get_local_va(off_t off, struct scif_window *window) +{ + struct page **pages = window->pinned_pages->pages; + int page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + + return page_address(pages[page_nr]) + page_off; +} + +static void scif_prog_signal_cb(void *arg) +{ + struct scif_status *status = arg; + + dma_pool_free(status->ep->remote_dev->signal_pool, status, + status->src_dma_addr); +} + +static int _scif_prog_signal(scif_epd_t epd, dma_addr_t dst, u64 val) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct dma_chan *chan = ep->rma_info.dma_chan; + struct dma_device *ddev = chan->device; + bool x100 = !is_dma_copy_aligned(chan->device, 1, 1, 1); + struct dma_async_tx_descriptor *tx; + struct scif_status *status = NULL; + dma_addr_t src; + dma_cookie_t cookie; + int err; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + dma_async_issue_pending(chan); + if (x100) { + /* + * For X100 use the status descriptor to write the value to + * the destination. + */ + tx = ddev->device_prep_dma_imm_data(chan, dst, val, 0); + } else { + status = dma_pool_alloc(ep->remote_dev->signal_pool, GFP_KERNEL, + &src); + if (!status) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + status->val = val; + status->src_dma_addr = src; + status->ep = ep; + src += offsetof(struct scif_status, val); + tx = ddev->device_prep_dma_memcpy(chan, dst, src, sizeof(val), + DMA_PREP_INTERRUPT); + } + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto dma_fail; + } + if (!x100) { + tx->callback = scif_prog_signal_cb; + tx->callback_param = status; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = -EIO; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto dma_fail; + } + dma_async_issue_pending(chan); + return 0; +dma_fail: + if (!x100) + dma_pool_free(ep->remote_dev->signal_pool, status, + status->src_dma_addr); +alloc_fail: + return err; +} + +/* + * scif_prog_signal: + * @epd - Endpoint Descriptor + * @offset - registered address to write @val to + * @val - Value to be written at @offset + * @type - Type of the window. + * + * Arrange to write a value to the registered offset after ensuring that the + * offset provided is indeed valid. + */ +int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, + enum scif_window_type type) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_window *window = NULL; + struct scif_rma_req req; + dma_addr_t dst_dma_addr; + int err; + + mutex_lock(&ep->rma_info.rma_lock); + req.out_window = &window; + req.offset = offset; + req.nr_bytes = sizeof(u64); + req.prot = SCIF_PROT_WRITE; + req.type = SCIF_WINDOW_SINGLE; + if (type == SCIF_WINDOW_SELF) + req.head = &ep->rma_info.reg_list; + else + req.head = &ep->rma_info.remote_reg_list; + /* Does a valid window exist? */ + err = scif_query_window(&req); + if (err) { + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + goto unlock_ret; + } + + if (scif_is_mgmt_node() && scifdev_self(ep->remote_dev)) { + u64 *dst_virt; + + if (type == SCIF_WINDOW_SELF) + dst_virt = scif_get_local_va(offset, window); + else + dst_virt = + scif_get_local_va(offset, (struct scif_window *) + window->peer_window); + *dst_virt = val; + } else { + dst_dma_addr = __scif_off_to_dma_addr(window, offset); + err = _scif_prog_signal(epd, dst_dma_addr, val); + } +unlock_ret: + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +static int _scif_fence_wait(scif_epd_t epd, int mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + dma_cookie_t cookie = mark & ~SCIF_REMOTE_FENCE; + int err; + + /* Wait for DMA callback in scif_fence_mark_cb(..) */ + err = wait_event_interruptible_timeout(ep->rma_info.markwq, + dma_async_is_tx_complete( + ep->rma_info.dma_chan, + cookie, NULL, NULL) == + DMA_COMPLETE, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err) + err = -ETIMEDOUT; + else if (err > 0) + err = 0; + return err; +} + +/** + * scif_rma_handle_remote_fences: + * + * This routine services remote fence requests. + */ +void scif_rma_handle_remote_fences(void) +{ + struct list_head *item, *tmp; + struct scif_remote_fence_info *fence; + struct scif_endpt *ep; + int mark, err; + + might_sleep(); + mutex_lock(&scif_info.fencelock); + list_for_each_safe(item, tmp, &scif_info.fence) { + fence = list_entry(item, struct scif_remote_fence_info, + list); + /* Remove fence from global list */ + list_del(&fence->list); + + /* Initiate the fence operation */ + ep = (struct scif_endpt *)fence->msg.payload[0]; + mark = fence->msg.payload[2]; + err = _scif_fence_wait(ep, mark); + if (err) + fence->msg.uop = SCIF_WAIT_NACK; + else + fence->msg.uop = SCIF_WAIT_ACK; + fence->msg.payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, &fence->msg); + kfree(fence); + if (!atomic_sub_return(1, &ep->rma_info.fence_refcount)) + schedule_work(&scif_info.misc_work); + } + mutex_unlock(&scif_info.fencelock); +} + +static int _scif_send_fence(scif_epd_t epd, int uop, int mark, int *out_mark) +{ + int err; + struct scifmsg msg; + struct scif_fence_info *fence_req; + struct scif_endpt *ep = (struct scif_endpt *)epd; + + fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL); + if (!fence_req) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_completion(&fence_req->comp); + + msg.src = ep->port; + msg.uop = uop; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (u64)fence_req; + if (uop == SCIF_WAIT) + msg.payload[2] = mark; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; +retry: + /* Wait for a SCIF_WAIT_(N)ACK message */ + err = wait_for_completion_timeout(&fence_req->comp, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + mutex_lock(&ep->rma_info.rma_lock); + if (err < 0) { + if (fence_req->state == OP_IN_PROGRESS) + fence_req->state = OP_FAILED; + } + if (fence_req->state == OP_FAILED && !err) + err = -ENOMEM; + if (uop == SCIF_MARK && fence_req->state == OP_COMPLETED) + *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark; + mutex_unlock(&ep->rma_info.rma_lock); +error_free: + kfree(fence_req); +error: + return err; +} + +/** + * scif_send_fence_mark: + * @epd: end point descriptor. + * @out_mark: Output DMA mark reported by peer. + * + * Send a remote fence mark request. + */ +static int scif_send_fence_mark(scif_epd_t epd, int *out_mark) +{ + return _scif_send_fence(epd, SCIF_MARK, 0, out_mark); +} + +/** + * scif_send_fence_wait: + * @epd: end point descriptor. + * @mark: DMA mark to wait for. + * + * Send a remote fence wait request. + */ +static int scif_send_fence_wait(scif_epd_t epd, int mark) +{ + return _scif_send_fence(epd, SCIF_WAIT, mark, NULL); +} + +static int _scif_send_fence_signal_wait(struct scif_endpt *ep, + struct scif_fence_info *fence_req) +{ + int err; + +retry: + /* Wait for a SCIF_SIG_(N)ACK message */ + err = wait_for_completion_timeout(&fence_req->comp, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (fence_req->state == OP_IN_PROGRESS) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (fence_req->state == OP_FAILED && !err) + err = -ENXIO; + return err; +} + +/** + * scif_send_fence_signal: + * @epd - endpoint descriptor + * @loff - local offset + * @lval - local value to write to loffset + * @roff - remote offset + * @rval - remote value to write to roffset + * @flags - flags + * + * Sends a remote fence signal request + */ +static int scif_send_fence_signal(scif_epd_t epd, off_t roff, u64 rval, + off_t loff, u64 lval, int flags) +{ + int err = 0; + struct scifmsg msg; + struct scif_fence_info *fence_req; + struct scif_endpt *ep = (struct scif_endpt *)epd; + + fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL); + if (!fence_req) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_completion(&fence_req->comp); + msg.src = ep->port; + if (flags & SCIF_SIGNAL_LOCAL) { + msg.uop = SCIF_SIG_LOCAL; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = roff; + msg.payload[2] = rval; + msg.payload[3] = (u64)fence_req; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; + err = _scif_send_fence_signal_wait(ep, fence_req); + if (err) + goto error_free; + } + fence_req->state = OP_IN_PROGRESS; + + if (flags & SCIF_SIGNAL_REMOTE) { + msg.uop = SCIF_SIG_REMOTE; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = loff; + msg.payload[2] = lval; + msg.payload[3] = (u64)fence_req; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; + err = _scif_send_fence_signal_wait(ep, fence_req); + } +error_free: + kfree(fence_req); +error: + return err; +} + +static void scif_fence_mark_cb(void *arg) +{ + struct scif_endpt *ep = (struct scif_endpt *)arg; + + wake_up_interruptible(&ep->rma_info.markwq); + atomic_dec(&ep->rma_info.fence_refcount); +} + +/* + * _scif_fence_mark: + * + * @epd - endpoint descriptor + * Set up a mark for this endpoint and return the value of the mark. + */ +int _scif_fence_mark(scif_epd_t epd, int *mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct dma_chan *chan = ep->rma_info.dma_chan; + struct dma_device *ddev = chan->device; + struct dma_async_tx_descriptor *tx; + dma_cookie_t cookie; + int err; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + dma_async_issue_pending(chan); + tx = ddev->device_prep_dma_interrupt(chan, DMA_PREP_INTERRUPT); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + tx->callback = scif_fence_mark_cb; + tx->callback_param = ep; + *mark = cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + atomic_inc(&ep->rma_info.fence_refcount); + dma_async_issue_pending(chan); + return 0; +} + +#define SCIF_LOOPB_MAGIC_MARK 0xdead + +int scif_fence_mark(scif_epd_t epd, int flags, int *mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x\n", + ep, flags, *mark); + err = scif_verify_epd(ep); + if (err) + return err; + + /* Invalid flags? */ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* + * Management node loopback does not need to use DMA. + * Return a valid mark to be symmetric. + */ + if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { + *mark = SCIF_LOOPB_MAGIC_MARK; + return 0; + } + + if (flags & SCIF_FENCE_INIT_SELF) + err = _scif_fence_mark(epd, mark); + else + err = scif_send_fence_mark(ep, mark); + + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x err %d\n", + ep, flags, *mark, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_mark); + +int scif_fence_wait(scif_epd_t epd, int mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_wait: ep %p mark 0x%x\n", + ep, mark); + err = scif_verify_epd(ep); + if (err) + return err; + /* + * Management node loopback does not need to use DMA. + * The only valid mark provided is 0 so simply + * return success if the mark is valid. + */ + if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { + if (mark == SCIF_LOOPB_MAGIC_MARK) + return 0; + else + return -EINVAL; + } + if (mark & SCIF_REMOTE_FENCE) + err = scif_send_fence_wait(epd, mark); + else + err = _scif_fence_wait(epd, mark); + if (err < 0) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_wait); + +int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, + off_t roff, u64 rval, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_signal: ep %p loff 0x%lx lval 0x%llx roff 0x%lx rval 0x%llx flags 0x%x\n", + ep, loff, lval, roff, rval, flags); + err = scif_verify_epd(ep); + if (err) + return err; + + /* Invalid flags? */ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER | + SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */ + if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))) + return -EINVAL; + + /* Only Dword offsets allowed */ + if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(u32) - 1))) + return -EINVAL; + + /* Only Dword aligned offsets allowed */ + if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(u32) - 1))) + return -EINVAL; + + if (flags & SCIF_FENCE_INIT_PEER) { + err = scif_send_fence_signal(epd, roff, rval, loff, + lval, flags); + } else { + /* Local Signal in Local RAS */ + if (flags & SCIF_SIGNAL_LOCAL) { + err = scif_prog_signal(epd, loff, lval, + SCIF_WINDOW_SELF); + if (err) + goto error_ret; + } + + /* Signal in Remote RAS */ + if (flags & SCIF_SIGNAL_REMOTE) + err = scif_prog_signal(epd, roff, + rval, SCIF_WINDOW_PEER); + } +error_ret: + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_signal); diff --git a/drivers/misc/mic/scif/scif_main.c b/drivers/misc/mic/scif/scif_main.c index 6ce851f5c..36d847af1 100644 --- a/drivers/misc/mic/scif/scif_main.c +++ b/drivers/misc/mic/scif/scif_main.c @@ -34,6 +34,7 @@ struct scif_info scif_info = { }; struct scif_dev *scif_dev; +struct kmem_cache *unaligned_cache; static atomic_t g_loopb_cnt; /* Runs in the context of intr_wq */ @@ -80,35 +81,6 @@ irqreturn_t scif_intr_handler(int irq, void *data) return IRQ_HANDLED; } -static int scif_peer_probe(struct scif_peer_dev *spdev) -{ - struct scif_dev *scifdev = &scif_dev[spdev->dnode]; - - mutex_lock(&scif_info.conflock); - scif_info.total++; - scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid); - mutex_unlock(&scif_info.conflock); - rcu_assign_pointer(scifdev->spdev, spdev); - - /* In the future SCIF kernel client devices will be added here */ - return 0; -} - -static void scif_peer_remove(struct scif_peer_dev *spdev) -{ - struct scif_dev *scifdev = &scif_dev[spdev->dnode]; - - /* In the future SCIF kernel client devices will be removed here */ - spdev = rcu_dereference(scifdev->spdev); - if (spdev) - RCU_INIT_POINTER(scifdev->spdev, NULL); - synchronize_rcu(); - - mutex_lock(&scif_info.conflock); - scif_info.total--; - mutex_unlock(&scif_info.conflock); -} - static void scif_qp_setup_handler(struct work_struct *work) { struct scif_dev *scifdev = container_of(work, struct scif_dev, @@ -139,20 +111,13 @@ static void scif_qp_setup_handler(struct work_struct *work) } } -static int scif_setup_scifdev(struct scif_hw_dev *sdev) +static int scif_setup_scifdev(void) { + /* We support a maximum of 129 SCIF nodes including the mgmt node */ +#define MAX_SCIF_NODES 129 int i; - u8 num_nodes; - - if (sdev->snode) { - struct mic_bootparam __iomem *bp = sdev->rdp; - - num_nodes = ioread8(&bp->tot_nodes); - } else { - struct mic_bootparam *bp = sdev->dp; + u8 num_nodes = MAX_SCIF_NODES; - num_nodes = bp->tot_nodes; - } scif_dev = kcalloc(num_nodes, sizeof(*scif_dev), GFP_KERNEL); if (!scif_dev) return -ENOMEM; @@ -163,7 +128,7 @@ static int scif_setup_scifdev(struct scif_hw_dev *sdev) scifdev->exit = OP_IDLE; init_waitqueue_head(&scifdev->disconn_wq); mutex_init(&scifdev->lock); - INIT_WORK(&scifdev->init_msg_work, scif_qp_response_ack); + INIT_WORK(&scifdev->peer_add_work, scif_add_peer_device); INIT_DELAYED_WORK(&scifdev->p2p_dwork, scif_poll_qp_state); INIT_DELAYED_WORK(&scifdev->qp_dwork, @@ -181,27 +146,21 @@ static void scif_destroy_scifdev(void) static int scif_probe(struct scif_hw_dev *sdev) { - struct scif_dev *scifdev; + struct scif_dev *scifdev = &scif_dev[sdev->dnode]; int rc; dev_set_drvdata(&sdev->dev, sdev); + scifdev->sdev = sdev; + if (1 == atomic_add_return(1, &g_loopb_cnt)) { - struct scif_dev *loopb_dev; + struct scif_dev *loopb_dev = &scif_dev[sdev->snode]; - rc = scif_setup_scifdev(sdev); - if (rc) - goto exit; - scifdev = &scif_dev[sdev->dnode]; - scifdev->sdev = sdev; - loopb_dev = &scif_dev[sdev->snode]; loopb_dev->sdev = sdev; rc = scif_setup_loopback_qp(loopb_dev); if (rc) - goto free_sdev; - } else { - scifdev = &scif_dev[sdev->dnode]; - scifdev->sdev = sdev; + goto exit; } + rc = scif_setup_intr_wq(scifdev); if (rc) goto destroy_loopb; @@ -237,8 +196,6 @@ destroy_intr: destroy_loopb: if (atomic_dec_and_test(&g_loopb_cnt)) scif_destroy_loopback_qp(&scif_dev[sdev->snode]); -free_sdev: - scif_destroy_scifdev(); exit: return rc; } @@ -290,13 +247,6 @@ static void scif_remove(struct scif_hw_dev *sdev) scifdev->sdev = NULL; } -static struct scif_peer_driver scif_peer_driver = { - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .probe = scif_peer_probe, - .remove = scif_peer_remove, -}; - static struct scif_hw_dev_id id_table[] = { { MIC_SCIF_DEV, SCIF_DEV_ANY_ID }, { 0 }, @@ -312,29 +262,54 @@ static struct scif_driver scif_driver = { static int _scif_init(void) { - spin_lock_init(&scif_info.eplock); + int rc; + + mutex_init(&scif_info.eplock); + spin_lock_init(&scif_info.rmalock); spin_lock_init(&scif_info.nb_connect_lock); spin_lock_init(&scif_info.port_lock); mutex_init(&scif_info.conflock); mutex_init(&scif_info.connlock); + mutex_init(&scif_info.fencelock); INIT_LIST_HEAD(&scif_info.uaccept); INIT_LIST_HEAD(&scif_info.listen); INIT_LIST_HEAD(&scif_info.zombie); INIT_LIST_HEAD(&scif_info.connected); INIT_LIST_HEAD(&scif_info.disconnected); + INIT_LIST_HEAD(&scif_info.rma); + INIT_LIST_HEAD(&scif_info.rma_tc); + INIT_LIST_HEAD(&scif_info.mmu_notif_cleanup); + INIT_LIST_HEAD(&scif_info.fence); INIT_LIST_HEAD(&scif_info.nb_connect_list); init_waitqueue_head(&scif_info.exitwq); + scif_info.rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT; scif_info.en_msg_log = 0; scif_info.p2p_enable = 1; + rc = scif_setup_scifdev(); + if (rc) + goto error; + unaligned_cache = kmem_cache_create("Unaligned_DMA", + SCIF_KMEM_UNALIGNED_BUF_SIZE, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!unaligned_cache) { + rc = -ENOMEM; + goto free_sdev; + } INIT_WORK(&scif_info.misc_work, scif_misc_handler); + INIT_WORK(&scif_info.mmu_notif_work, scif_mmu_notif_handler); INIT_WORK(&scif_info.conn_work, scif_conn_handler); idr_init(&scif_ports); return 0; +free_sdev: + scif_destroy_scifdev(); +error: + return rc; } static void _scif_exit(void) { idr_destroy(&scif_ports); + kmem_cache_destroy(unaligned_cache); scif_destroy_scifdev(); } @@ -344,15 +319,13 @@ static int __init scif_init(void) int rc; _scif_init(); + iova_cache_get(); rc = scif_peer_bus_init(); if (rc) goto exit; - rc = scif_peer_register_driver(&scif_peer_driver); - if (rc) - goto peer_bus_exit; rc = scif_register_driver(&scif_driver); if (rc) - goto unreg_scif_peer; + goto peer_bus_exit; rc = misc_register(mdev); if (rc) goto unreg_scif; @@ -360,8 +333,6 @@ static int __init scif_init(void) return 0; unreg_scif: scif_unregister_driver(&scif_driver); -unreg_scif_peer: - scif_peer_unregister_driver(&scif_peer_driver); peer_bus_exit: scif_peer_bus_exit(); exit: @@ -374,8 +345,8 @@ static void __exit scif_exit(void) scif_exit_debugfs(); misc_deregister(&scif_info.mdev); scif_unregister_driver(&scif_driver); - scif_peer_unregister_driver(&scif_peer_driver); scif_peer_bus_exit(); + iova_cache_put(); _scif_exit(); } diff --git a/drivers/misc/mic/scif/scif_main.h b/drivers/misc/mic/scif/scif_main.h index 580bc63e1..a08f0b600 100644 --- a/drivers/misc/mic/scif/scif_main.h +++ b/drivers/misc/mic/scif/scif_main.h @@ -22,15 +22,18 @@ #include <linux/pci.h> #include <linux/miscdevice.h> #include <linux/dmaengine.h> +#include <linux/iova.h> +#include <linux/anon_inodes.h> #include <linux/file.h> +#include <linux/vmalloc.h> #include <linux/scif.h> - #include "../common/mic_dev.h" #define SCIF_MGMT_NODE 0 #define SCIF_DEFAULT_WATCHDOG_TO 30 #define SCIF_NODE_ACCEPT_TIMEOUT (3 * HZ) #define SCIF_NODE_ALIVE_TIMEOUT (SCIF_DEFAULT_WATCHDOG_TO * HZ) +#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000 /* * Generic state used for certain node QP message exchanges @@ -73,13 +76,21 @@ enum scif_msg_state { * @loopb_work: Used for submitting work to loopb_wq * @loopb_recv_q: List of messages received on the loopb_wq * @card_initiated_exit: set when the card has initiated the exit + * @rmalock: Synchronize access to RMA operations + * @fencelock: Synchronize access to list of remote fences requested. + * @rma: List of temporary registered windows to be destroyed. + * @rma_tc: List of temporary registered & cached Windows to be destroyed + * @fence: List of remote fence requests + * @mmu_notif_work: Work for registration caching MMU notifier workqueue + * @mmu_notif_cleanup: List of temporary cached windows for reg cache + * @rma_tc_limit: RMA temporary cache limit */ struct scif_info { u8 nodeid; u8 maxid; u8 total; u32 nr_zombies; - spinlock_t eplock; + struct mutex eplock; struct mutex connlock; spinlock_t nb_connect_lock; spinlock_t port_lock; @@ -102,6 +113,14 @@ struct scif_info { struct work_struct loopb_work; struct list_head loopb_recv_q; bool card_initiated_exit; + spinlock_t rmalock; + struct mutex fencelock; + struct list_head rma; + struct list_head rma_tc; + struct list_head fence; + struct work_struct mmu_notif_work; + struct list_head mmu_notif_cleanup; + unsigned long rma_tc_limit; }; /* @@ -139,7 +158,7 @@ struct scif_p2p_info { * @db: doorbell the peer will trigger to generate an interrupt on self * @rdb: Doorbell to trigger on the peer to generate an interrupt on the peer * @cookie: Cookie received while registering the interrupt handler - * init_msg_work: work scheduled for SCIF_INIT message processing + * @peer_add_work: Work for handling device_add for peer devices * @p2p_dwork: Delayed work to enable polling for P2P state * @qp_dwork: Delayed work for enabling polling for remote QP information * @p2p_retry: Number of times to retry polling of P2P state @@ -152,6 +171,8 @@ struct scif_p2p_info { * @disconn_rescnt: Keeps track of number of node remove requests sent * @exit: Status of exit message * @qp_dma_addr: Queue pair DMA address passed to the peer + * @dma_ch_idx: Round robin index for DMA channels + * @signal_pool: DMA pool used for scheduling scif_fence_signal DMA's */ struct scif_dev { u8 node; @@ -165,7 +186,7 @@ struct scif_dev { int db; int rdb; struct mic_irq *cookie; - struct work_struct init_msg_work; + struct work_struct peer_add_work; struct delayed_work p2p_dwork; struct delayed_work qp_dwork; int p2p_retry; @@ -178,17 +199,25 @@ struct scif_dev { atomic_t disconn_rescnt; enum scif_msg_state exit; dma_addr_t qp_dma_addr; + int dma_ch_idx; + struct dma_pool *signal_pool; }; +extern bool scif_reg_cache_enable; +extern bool scif_ulimit_check; extern struct scif_info scif_info; extern struct idr scif_ports; +extern struct bus_type scif_peer_bus; extern struct scif_dev *scif_dev; extern const struct file_operations scif_fops; +extern const struct file_operations scif_anon_fops; /* Size of the RB for the Node QP */ #define SCIF_NODE_QP_SIZE 0x10000 #include "scif_nodeqp.h" +#include "scif_rma.h" +#include "scif_rma_list.h" /* * scifdev_self: diff --git a/drivers/misc/mic/scif/scif_map.h b/drivers/misc/mic/scif/scif_map.h index 20e50b4e1..3e86360ba 100644 --- a/drivers/misc/mic/scif/scif_map.h +++ b/drivers/misc/mic/scif/scif_map.h @@ -80,7 +80,7 @@ scif_unmap_single(dma_addr_t local, struct scif_dev *scifdev, size_t size) { if (!scifdev_self(scifdev)) { - if (scifdev_is_p2p(scifdev) && local > scifdev->base_addr) + if (scifdev_is_p2p(scifdev)) local = local - scifdev->base_addr; dma_unmap_single(&scifdev->sdev->dev, local, size, DMA_BIDIRECTIONAL); @@ -110,4 +110,27 @@ scif_iounmap(void *virt, size_t len, struct scif_dev *scifdev) sdev->hw_ops->iounmap(sdev, (void __force __iomem *)virt); } } + +static __always_inline int +scif_map_page(dma_addr_t *dma_handle, struct page *page, + struct scif_dev *scifdev) +{ + int err = 0; + + if (scifdev_self(scifdev)) { + *dma_handle = page_to_phys(page); + } else { + struct scif_hw_dev *sdev = scifdev->sdev; + *dma_handle = dma_map_page(&sdev->dev, + page, 0x0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(&sdev->dev, *dma_handle)) + err = -ENOMEM; + else if (scifdev_is_p2p(scifdev)) + *dma_handle = *dma_handle + scifdev->base_addr; + } + if (err) + *dma_handle = 0; + return err; +} #endif /* SCIF_MAP_H */ diff --git a/drivers/misc/mic/scif/scif_mmap.c b/drivers/misc/mic/scif/scif_mmap.c new file mode 100644 index 000000000..49cb8f7b4 --- /dev/null +++ b/drivers/misc/mic/scif/scif_mmap.c @@ -0,0 +1,699 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_main.h" + +/* + * struct scif_vma_info - Information about a remote memory mapping + * created via scif_mmap(..) + * @vma: VM area struct + * @list: link to list of active vmas + */ +struct scif_vma_info { + struct vm_area_struct *vma; + struct list_head list; +}; + +void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_window *recv_window = + (struct scif_window *)msg->payload[0]; + struct scif_endpt *ep; + + ep = (struct scif_endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = recv_window->prot; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if (scif_query_window(&req)) { + dev_err(&scifdev->sdev->dev, + "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + + scif_put_window(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + ep->rma_info.async_list_del = 1; + list_del_init(&window->list); + scif_free_window_offset(ep, window, window->offset); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (window && !window->ref_count) + scif_queue_for_cleanup(window, &scif_info.rma); +} + +/* + * Remove valid remote memory mappings created via scif_mmap(..) from the + * process address space since the remote node is lost + */ +static void __scif_zap_mmaps(struct scif_endpt *ep) +{ + struct list_head *item; + struct scif_vma_info *info; + struct vm_area_struct *vma; + unsigned long size; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.vma_list) { + info = list_entry(item, struct scif_vma_info, list); + vma = info->vma; + size = vma->vm_end - vma->vm_start; + zap_vma_ptes(vma, vma->vm_start, size); + dev_dbg(scif_info.mdev.this_device, + "%s ep %p zap vma %p size 0x%lx\n", + __func__, ep, info->vma, size); + } + spin_unlock(&ep->lock); +} + +/* + * Traverse the list of endpoints for a particular remote node and + * zap valid remote memory mappings since the remote node is lost + */ +static void _scif_zap_mmaps(int node, struct list_head *head) +{ + struct scif_endpt *ep; + struct list_head *item; + + mutex_lock(&scif_info.connlock); + list_for_each(item, head) { + ep = list_entry(item, struct scif_endpt, list); + if (ep->remote_dev->node == node) + __scif_zap_mmaps(ep); + } + mutex_unlock(&scif_info.connlock); +} + +/* + * Wrapper for removing remote memory mappings for a particular node. This API + * is called by peer nodes as part of handling a lost node. + */ +void scif_zap_mmaps(int node) +{ + _scif_zap_mmaps(node, &scif_info.connected); + _scif_zap_mmaps(node, &scif_info.disconnected); +} + +/* + * This API is only called while handling a lost node: + * a) Remote node is dead. + * b) Remote memory mappings have been zapped + * So we can traverse the remote_reg_list without any locks. Since + * the window has not yet been unregistered we can drop the ref count + * and queue it to the cleanup thread. + */ +static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep) +{ + struct list_head *pos, *tmp; + struct scif_window *window; + + list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(pos, struct scif_window, list); + if (window->ref_count) + scif_put_window(window, window->nr_pages); + else + dev_err(scif_info.mdev.this_device, + "%s %d unexpected\n", + __func__, __LINE__); + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + list_del_init(&window->list); + scif_queue_for_cleanup(window, &scif_info.rma); + } + } +} + +/* Cleanup remote registration lists for zombie endpoints */ +void scif_cleanup_rma_for_zombies(int node) +{ + struct scif_endpt *ep; + struct list_head *item; + + mutex_lock(&scif_info.eplock); + list_for_each(item, &scif_info.zombie) { + ep = list_entry(item, struct scif_endpt, list); + if (ep->remote_dev && ep->remote_dev->node == node) + __scif_cleanup_rma_for_zombies(ep); + } + mutex_unlock(&scif_info.eplock); + flush_work(&scif_info.misc_work); +} + +/* Insert the VMA into the per endpoint VMA list */ +static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma) +{ + struct scif_vma_info *info; + int err = 0; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto done; + } + info->vma = vma; + spin_lock(&ep->lock); + list_add_tail(&info->list, &ep->rma_info.vma_list); + spin_unlock(&ep->lock); +done: + return err; +} + +/* Delete the VMA from the per endpoint VMA list */ +static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma) +{ + struct list_head *item; + struct scif_vma_info *info; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.vma_list) { + info = list_entry(item, struct scif_vma_info, list); + if (info->vma == vma) { + list_del(&info->list); + kfree(info); + break; + } + } + spin_unlock(&ep->lock); +} + +static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep) +{ + struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev; + struct scif_hw_dev *sdev = scifdev->sdev; + phys_addr_t out_phys, apt_base = 0; + + /* + * If the DMA address is card relative then we need to add the + * aperture base for mmap to work correctly + */ + if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da) + apt_base = sdev->aper->pa; + out_phys = apt_base + phys; + return out_phys; +} + +int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, + struct scif_range **pages) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_rma_req req; + struct scif_window *window = NULL; + int nr_pages, err, i; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n", + ep, offset, len); + err = scif_verify_epd(ep); + if (err) + return err; + + if (!len || (offset < 0) || + (offset + len < offset) || + (ALIGN(offset, PAGE_SIZE) != offset) || + (ALIGN(len, PAGE_SIZE) != len)) + return -EINVAL; + + nr_pages = len >> PAGE_SHIFT; + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = SCIF_WINDOW_SINGLE; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + + /* Allocate scif_range */ + *pages = kzalloc(sizeof(**pages), GFP_KERNEL); + if (!*pages) { + err = -ENOMEM; + goto error; + } + + /* Allocate phys addr array */ + (*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)); + if (!((*pages)->phys_addr)) { + err = -ENOMEM; + goto error; + } + + if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) { + /* Allocate virtual address array */ + ((*pages)->va = scif_zalloc(nr_pages * sizeof(void *))); + if (!(*pages)->va) { + err = -ENOMEM; + goto error; + } + } + /* Populate the values */ + (*pages)->cookie = window; + (*pages)->nr_pages = nr_pages; + (*pages)->prot_flags = window->prot; + + for (i = 0; i < nr_pages; i++) { + (*pages)->phys_addr[i] = + __scif_off_to_dma_addr(window, offset + + (i * PAGE_SIZE)); + (*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i], + ep); + if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) + (*pages)->va[i] = + ep->remote_dev->sdev->aper->va + + (*pages)->phys_addr[i] - + ep->remote_dev->sdev->aper->pa; + } + + scif_get_window(window, nr_pages); +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (err) { + if (*pages) { + scif_free((*pages)->phys_addr, + nr_pages * sizeof(dma_addr_t)); + scif_free((*pages)->va, + nr_pages * sizeof(void *)); + kfree(*pages); + *pages = NULL; + } + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + } + return err; +} +EXPORT_SYMBOL_GPL(scif_get_pages); + +int scif_put_pages(struct scif_range *pages) +{ + struct scif_endpt *ep; + struct scif_window *window; + struct scifmsg msg; + + if (!pages || !pages->cookie) + return -EINVAL; + + window = pages->cookie; + + if (!window || window->magic != SCIFEP_MAGIC) + return -EINVAL; + + ep = (struct scif_endpt *)window->ep; + /* + * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the + * callee should be allowed to release references to the pages, + * else the endpoint was not connected in the first place, + * hence the ENOTCONN. + */ + if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED) + return -ENOTCONN; + + mutex_lock(&ep->rma_info.rma_lock); + + scif_put_window(window, pages->nr_pages); + + /* Initiate window destruction if ref count is zero */ + if (!window->ref_count) { + list_del(&window->list); + mutex_unlock(&ep->rma_info.rma_lock); + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + /* Inform the peer about this window being destroyed. */ + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + msg.payload[0] = window->peer_window; + /* No error handling for notification messages */ + scif_nodeqp_send(ep->remote_dev, &msg); + /* Destroy this window from the peer's registered AS */ + scif_destroy_remote_window(window); + } else { + mutex_unlock(&ep->rma_info.rma_lock); + } + + scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t)); + scif_free(pages->va, pages->nr_pages * sizeof(void *)); + kfree(pages); + return 0; +} +EXPORT_SYMBOL_GPL(scif_put_pages); + +/* + * scif_rma_list_mmap: + * + * Traverse the remote registration list starting from start_window: + * 1) Create VtoP mappings via remap_pfn_range(..) + * 2) Once step 1) and 2) complete successfully then traverse the range of + * windows again and bump the reference count. + * RMA lock must be held. + */ +static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset, + int nr_pages, struct vm_area_struct *vma) +{ + s64 end_offset, loop_offset = offset; + struct scif_window *window = start_window; + int loop_nr_pages, nr_pages_left = nr_pages; + struct scif_endpt *ep = (struct scif_endpt *)start_window->ep; + struct list_head *head = &ep->rma_info.remote_reg_list; + int i, err = 0; + dma_addr_t phys_addr; + struct scif_window_iter src_win_iter; + size_t contig_bytes = 0; + + might_sleep(); + list_for_each_entry_from(window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_init_window_iter(window, &src_win_iter); + for (i = 0; i < loop_nr_pages; i++) { + phys_addr = scif_off_to_dma_addr(window, loop_offset, + &contig_bytes, + &src_win_iter); + phys_addr = scif_get_phys(phys_addr, ep); + err = remap_pfn_range(vma, + vma->vm_start + + loop_offset - offset, + phys_addr >> PAGE_SHIFT, + PAGE_SIZE, + vma->vm_page_prot); + if (err) + goto error; + loop_offset += PAGE_SIZE; + } + nr_pages_left -= loop_nr_pages; + if (!nr_pages_left) + break; + } + /* + * No more failures expected. Bump up the ref count for all + * the windows. Another traversal from start_window required + * for handling errors encountered across windows during + * remap_pfn_range(..). + */ + loop_offset = offset; + nr_pages_left = nr_pages; + window = start_window; + head = &ep->rma_info.remote_reg_list; + list_for_each_entry_from(window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_get_window(window, loop_nr_pages); + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } +error: + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} + +/* + * scif_rma_list_munmap: + * + * Traverse the remote registration list starting from window: + * 1) Decrement ref count. + * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer. + * RMA lock must be held. + */ +static void scif_rma_list_munmap(struct scif_window *start_window, + s64 offset, int nr_pages) +{ + struct scifmsg msg; + s64 loop_offset = offset, end_offset; + int loop_nr_pages, nr_pages_left = nr_pages; + struct scif_endpt *ep = (struct scif_endpt *)start_window->ep; + struct list_head *head = &ep->rma_info.remote_reg_list; + struct scif_window *window = start_window, *_window; + + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + loop_offset = offset; + nr_pages_left = nr_pages; + list_for_each_entry_safe_from(window, _window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_put_window(window, loop_nr_pages); + if (!window->ref_count) { + struct scif_dev *rdev = ep->remote_dev; + + scif_drain_dma_intr(rdev->sdev, + ep->rma_info.dma_chan); + /* Inform the peer about this munmap */ + msg.payload[0] = window->peer_window; + /* No error handling for Notification messages. */ + scif_nodeqp_send(ep->remote_dev, &msg); + list_del(&window->list); + /* Destroy this window from the peer's registered AS */ + scif_destroy_remote_window(window); + } + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } +} + +/* + * The private data field of each VMA used to mmap a remote window + * points to an instance of struct vma_pvt + */ +struct vma_pvt { + struct scif_endpt *ep; /* End point for remote window */ + s64 offset; /* offset within remote window */ + bool valid_offset; /* offset is valid only if the original + * mmap request was for a single page + * else the offset within the vma is + * the correct offset + */ + struct kref ref; +}; + +static void vma_pvt_release(struct kref *ref) +{ + struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref); + + kfree(vmapvt); +} + +/** + * scif_vma_open - VMA open driver callback + * @vma: VMM memory area. + * The open method is called by the kernel to allow the subsystem implementing + * the VMA to initialize the area. This method is invoked any time a new + * reference to the VMA is made (when a process forks, for example). + * The one exception happens when the VMA is first created by mmap; + * in this case, the driver's mmap method is called instead. + * This function is also invoked when an existing VMA is split by the kernel + * due to a call to munmap on a subset of the VMA resulting in two VMAs. + * The kernel invokes this function only on one of the two VMAs. + */ +static void scif_vma_open(struct vm_area_struct *vma) +{ + struct vma_pvt *vmapvt = vma->vm_private_data; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n", + vma->vm_start, vma->vm_end); + scif_insert_vma(vmapvt->ep, vma); + kref_get(&vmapvt->ref); +} + +/** + * scif_munmap - VMA close driver callback. + * @vma: VMM memory area. + * When an area is destroyed, the kernel calls its close operation. + * Note that there's no usage count associated with VMA's; the area + * is opened and closed exactly once by each process that uses it. + */ +static void scif_munmap(struct vm_area_struct *vma) +{ + struct scif_endpt *ep; + struct vma_pvt *vmapvt = vma->vm_private_data; + int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + s64 offset; + struct scif_rma_req req; + struct scif_window *window = NULL; + int err; + + might_sleep(); + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n", + vma->vm_start, vma->vm_end); + ep = vmapvt->ep; + offset = vmapvt->valid_offset ? vmapvt->offset : + (vma->vm_pgoff) << PAGE_SHIFT; + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n", + ep, nr_pages, offset); + req.out_window = &window; + req.offset = offset; + req.nr_bytes = vma->vm_end - vma->vm_start; + req.prot = vma->vm_flags & (VM_READ | VM_WRITE); + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + + err = scif_query_window(&req); + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + else + scif_rma_list_munmap(window, offset, nr_pages); + + mutex_unlock(&ep->rma_info.rma_lock); + /* + * The kernel probably zeroes these out but we still want + * to clean up our own mess just in case. + */ + vma->vm_ops = NULL; + vma->vm_private_data = NULL; + kref_put(&vmapvt->ref, vma_pvt_release); + scif_delete_vma(ep, vma); +} + +static const struct vm_operations_struct scif_vm_ops = { + .open = scif_vma_open, + .close = scif_munmap, +}; + +/** + * scif_mmap - Map pages in virtual address space to a remote window. + * @vma: VMM memory area. + * @epd: endpoint descriptor + * + * Return: Upon successful completion, scif_mmap() returns zero + * else an apt error is returned as documented in scif.h + */ +int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 start_offset = vma->vm_pgoff << PAGE_SHIFT; + int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int err; + struct vma_pvt *vmapvt; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n", + ep, start_offset, nr_pages); + err = scif_verify_epd(ep); + if (err) + return err; + + might_sleep(); + + err = scif_insert_vma(ep, vma); + if (err) + return err; + + vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL); + if (!vmapvt) { + scif_delete_vma(ep, vma); + return -ENOMEM; + } + + vmapvt->ep = ep; + kref_init(&vmapvt->ref); + + req.out_window = &window; + req.offset = start_offset; + req.nr_bytes = vma->vm_end - vma->vm_start; + req.prot = vma->vm_flags & (VM_READ | VM_WRITE); + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unlock; + } + + /* Default prot for loopback */ + if (!scifdev_self(ep->remote_dev)) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + /* + * VM_DONTCOPY - Do not copy this vma on fork + * VM_DONTEXPAND - Cannot expand with mremap() + * VM_RESERVED - Count as reserved_vm like IO + * VM_PFNMAP - Page-ranges managed without "struct page" + * VM_IO - Memory mapped I/O or similar + * + * We do not want to copy this VMA automatically on a fork(), + * expand this VMA due to mremap() or swap out these pages since + * the VMA is actually backed by physical pages in the remote + * node's physical memory and not via a struct page. + */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + + if (!scifdev_self(ep->remote_dev)) + vma->vm_flags |= VM_IO | VM_PFNMAP; + + /* Map this range of windows */ + err = scif_rma_list_mmap(window, start_offset, nr_pages, vma); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unlock; + } + /* Set up the driver call back */ + vma->vm_ops = &scif_vm_ops; + vma->vm_private_data = vmapvt; +error_unlock: + mutex_unlock(&ep->rma_info.rma_lock); + if (err) { + kfree(vmapvt); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + scif_delete_vma(ep, vma); + } + return err; +} diff --git a/drivers/misc/mic/scif/scif_nm.c b/drivers/misc/mic/scif/scif_nm.c index 9b4c5382d..79f26a02a 100644 --- a/drivers/misc/mic/scif/scif_nm.c +++ b/drivers/misc/mic/scif/scif_nm.c @@ -34,6 +34,7 @@ static void scif_invalidate_ep(int node) list_for_each_safe(pos, tmpq, &scif_info.disconnected) { ep = list_entry(pos, struct scif_endpt, list); if (ep->remote_dev->node == node) { + scif_unmap_all_windows(ep); spin_lock(&ep->lock); scif_cleanup_ep_qp(ep); spin_unlock(&ep->lock); @@ -50,6 +51,7 @@ static void scif_invalidate_ep(int node) wake_up_interruptible(&ep->sendwq); wake_up_interruptible(&ep->recvwq); spin_unlock(&ep->lock); + scif_unmap_all_windows(ep); } } mutex_unlock(&scif_info.connlock); @@ -61,8 +63,8 @@ void scif_free_qp(struct scif_dev *scifdev) if (!qp) return; - scif_free_coherent((void *)qp->inbound_q.rb_base, - qp->local_buf, scifdev, qp->inbound_q.size); + scif_unmap_single(qp->local_buf, scifdev, qp->inbound_q.size); + kfree(qp->inbound_q.rb_base); scif_unmap_single(qp->local_qp, scifdev, sizeof(struct scif_qp)); kfree(scifdev->qpairs); scifdev->qpairs = NULL; @@ -125,8 +127,12 @@ void scif_cleanup_scifdev(struct scif_dev *dev) } scif_destroy_intr_wq(dev); } + flush_work(&scif_info.misc_work); scif_destroy_p2p(dev); scif_invalidate_ep(dev->node); + scif_zap_mmaps(dev->node); + scif_cleanup_rma_for_zombies(dev->node); + flush_work(&scif_info.misc_work); scif_send_acks(dev); if (!dev->node && scif_info.card_initiated_exit) { /* @@ -147,14 +153,8 @@ void scif_cleanup_scifdev(struct scif_dev *dev) void scif_handle_remove_node(int node) { struct scif_dev *scifdev = &scif_dev[node]; - struct scif_peer_dev *spdev; - - rcu_read_lock(); - spdev = rcu_dereference(scifdev->spdev); - rcu_read_unlock(); - if (spdev) - scif_peer_unregister_device(spdev); - else + + if (scif_peer_unregister_device(scifdev)) scif_send_acks(scifdev); } diff --git a/drivers/misc/mic/scif/scif_nodeqp.c b/drivers/misc/mic/scif/scif_nodeqp.c index 6dfdae345..c66ca1a58 100644 --- a/drivers/misc/mic/scif/scif_nodeqp.c +++ b/drivers/misc/mic/scif/scif_nodeqp.c @@ -105,18 +105,22 @@ int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset, int local_size, struct scif_dev *scifdev) { - void *local_q = NULL; + void *local_q = qp->inbound_q.rb_base; int err = 0; u32 tmp_rd = 0; spin_lock_init(&qp->send_lock); spin_lock_init(&qp->recv_lock); - local_q = kzalloc(local_size, GFP_KERNEL); + /* Allocate rb only if not already allocated */ if (!local_q) { - err = -ENOMEM; - return err; + local_q = kzalloc(local_size, GFP_KERNEL); + if (!local_q) { + err = -ENOMEM; + return err; + } } + err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size); if (err) goto kfree; @@ -260,6 +264,11 @@ int scif_setup_qp_connect_response(struct scif_dev *scifdev, r_buf, get_count_order(remote_size)); /* + * Because the node QP may already be processing an INIT message, set + * the read pointer so the cached read offset isn't lost + */ + qp->remote_qp->local_read = qp->inbound_q.current_read_offset; + /* * resetup the inbound_q now that we know where the * inbound_read really is. */ @@ -426,6 +435,21 @@ free_p2p: return NULL; } +/* Uninitialize and release resources from a p2p mapping */ +static void scif_deinit_p2p_info(struct scif_dev *scifdev, + struct scif_p2p_info *p2p) +{ + struct scif_hw_dev *sdev = scifdev->sdev; + + dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], + p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL); + dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER], + p2p->sg_nentries[SCIF_PPI_APER], DMA_BIDIRECTIONAL); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); + kfree(p2p); +} + /** * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message * @dst: Destination node @@ -468,8 +492,10 @@ static void scif_node_connect(struct scif_dev *scifdev, int dst) if (!p2p_ij) return; p2p_ji = scif_init_p2p_info(dev_j, dev_i); - if (!p2p_ji) + if (!p2p_ji) { + scif_deinit_p2p_info(dev_i, p2p_ij); return; + } list_add_tail(&p2p_ij->ppi_list, &dev_i->p2p); list_add_tail(&p2p_ji->ppi_list, &dev_j->p2p); @@ -529,27 +555,6 @@ static void scif_p2p_setup(void) } } -void scif_qp_response_ack(struct work_struct *work) -{ - struct scif_dev *scifdev = container_of(work, struct scif_dev, - init_msg_work); - struct scif_peer_dev *spdev; - - /* Drop the INIT message if it has already been received */ - if (_scifdev_alive(scifdev)) - return; - - spdev = scif_peer_register_device(scifdev); - if (IS_ERR(spdev)) - return; - - if (scif_is_mgmt_node()) { - mutex_lock(&scif_info.conflock); - scif_p2p_setup(); - mutex_unlock(&scif_info.conflock); - } -} - static char *message_types[] = {"BAD", "INIT", "EXIT", @@ -568,7 +573,29 @@ static char *message_types[] = {"BAD", "DISCNT_ACK", "CLIENT_SENT", "CLIENT_RCVD", - "SCIF_GET_NODE_INFO"}; + "SCIF_GET_NODE_INFO", + "REGISTER", + "REGISTER_ACK", + "REGISTER_NACK", + "UNREGISTER", + "UNREGISTER_ACK", + "UNREGISTER_NACK", + "ALLOC_REQ", + "ALLOC_GNT", + "ALLOC_REJ", + "FREE_PHYS", + "FREE_VIRT", + "MUNMAP", + "MARK", + "MARK_ACK", + "MARK_NACK", + "WAIT", + "WAIT_ACK", + "WAIT_NACK", + "SIGNAL_LOCAL", + "SIGNAL_REMOTE", + "SIG_ACK", + "SIG_NACK"}; static void scif_display_message(struct scif_dev *scifdev, struct scifmsg *msg, @@ -662,10 +689,16 @@ int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg) * * Work queue handler for servicing miscellaneous SCIF tasks. * Examples include: - * 1) Cleanup of zombie endpoints. + * 1) Remote fence requests. + * 2) Destruction of temporary registered windows + * created during scif_vreadfrom()/scif_vwriteto(). + * 3) Cleanup of zombie endpoints. */ void scif_misc_handler(struct work_struct *work) { + scif_rma_handle_remote_fences(); + scif_rma_destroy_windows(); + scif_rma_destroy_tcw_invalid(); scif_cleanup_zombie_epd(); } @@ -682,13 +715,14 @@ scif_init(struct scif_dev *scifdev, struct scifmsg *msg) * address to complete initializing the inbound_q. */ flush_delayed_work(&scifdev->qp_dwork); - /* - * Delegate the peer device registration to a workqueue, otherwise if - * SCIF client probe (called during peer device registration) calls - * scif_connect(..), it will block the message processing thread causing - * a deadlock. - */ - schedule_work(&scifdev->init_msg_work); + + scif_peer_register_device(scifdev); + + if (scif_is_mgmt_node()) { + mutex_lock(&scif_info.conflock); + scif_p2p_setup(); + mutex_unlock(&scif_info.conflock); + } } /** @@ -838,13 +872,13 @@ void scif_poll_qp_state(struct work_struct *work) msecs_to_jiffies(SCIF_NODE_QP_TIMEOUT)); return; } - scif_peer_register_device(peerdev); return; timeout: dev_err(&peerdev->sdev->dev, "%s %d remote node %d offline, state = 0x%x\n", __func__, __LINE__, peerdev->node, qp->qp_state); qp->remote_qp->qp_state = SCIF_QP_OFFLINE; + scif_peer_unregister_device(peerdev); scif_cleanup_scifdev(peerdev); } @@ -894,6 +928,9 @@ scif_node_add_ack(struct scif_dev *scifdev, struct scifmsg *msg) goto local_error; peerdev->rdb = msg->payload[2]; qp->remote_qp->qp_state = SCIF_QP_ONLINE; + + scif_peer_register_device(peerdev); + schedule_delayed_work(&peerdev->p2p_dwork, 0); return; local_error: @@ -1007,6 +1044,27 @@ static void (*scif_intr_func[SCIF_MAX_MSG + 1]) scif_clientsend, /* SCIF_CLIENT_SENT */ scif_clientrcvd, /* SCIF_CLIENT_RCVD */ scif_get_node_info_resp,/* SCIF_GET_NODE_INFO */ + scif_recv_reg, /* SCIF_REGISTER */ + scif_recv_reg_ack, /* SCIF_REGISTER_ACK */ + scif_recv_reg_nack, /* SCIF_REGISTER_NACK */ + scif_recv_unreg, /* SCIF_UNREGISTER */ + scif_recv_unreg_ack, /* SCIF_UNREGISTER_ACK */ + scif_recv_unreg_nack, /* SCIF_UNREGISTER_NACK */ + scif_alloc_req, /* SCIF_ALLOC_REQ */ + scif_alloc_gnt_rej, /* SCIF_ALLOC_GNT */ + scif_alloc_gnt_rej, /* SCIF_ALLOC_REJ */ + scif_free_virt, /* SCIF_FREE_VIRT */ + scif_recv_munmap, /* SCIF_MUNMAP */ + scif_recv_mark, /* SCIF_MARK */ + scif_recv_mark_resp, /* SCIF_MARK_ACK */ + scif_recv_mark_resp, /* SCIF_MARK_NACK */ + scif_recv_wait, /* SCIF_WAIT */ + scif_recv_wait_resp, /* SCIF_WAIT_ACK */ + scif_recv_wait_resp, /* SCIF_WAIT_NACK */ + scif_recv_sig_local, /* SCIF_SIG_LOCAL */ + scif_recv_sig_remote, /* SCIF_SIG_REMOTE */ + scif_recv_sig_resp, /* SCIF_SIG_ACK */ + scif_recv_sig_resp, /* SCIF_SIG_NACK */ }; /** @@ -1169,7 +1227,6 @@ int scif_setup_loopback_qp(struct scif_dev *scifdev) int err = 0; void *local_q; struct scif_qp *qp; - struct scif_peer_dev *spdev; err = scif_setup_intr_wq(scifdev); if (err) @@ -1216,15 +1273,11 @@ int scif_setup_loopback_qp(struct scif_dev *scifdev) &qp->local_write, local_q, get_count_order(SCIF_NODE_QP_SIZE)); scif_info.nodeid = scifdev->node; - spdev = scif_peer_register_device(scifdev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - goto free_local_q; - } + + scif_peer_register_device(scifdev); + scif_info.loopb_dev = scifdev; return err; -free_local_q: - kfree(local_q); free_qpairs: kfree(scifdev->qpairs); destroy_loopb_wq: @@ -1243,13 +1296,7 @@ exit: */ int scif_destroy_loopback_qp(struct scif_dev *scifdev) { - struct scif_peer_dev *spdev; - - rcu_read_lock(); - spdev = rcu_dereference(scifdev->spdev); - rcu_read_unlock(); - if (spdev) - scif_peer_unregister_device(spdev); + scif_peer_unregister_device(scifdev); destroy_workqueue(scif_info.loopb_wq); scif_destroy_intr_wq(scifdev); kfree(scifdev->qpairs->outbound_q.rb_base); diff --git a/drivers/misc/mic/scif/scif_nodeqp.h b/drivers/misc/mic/scif/scif_nodeqp.h index 6c0ed6783..958962731 100644 --- a/drivers/misc/mic/scif/scif_nodeqp.h +++ b/drivers/misc/mic/scif/scif_nodeqp.h @@ -74,7 +74,28 @@ #define SCIF_CLIENT_SENT 16 /* Notify the peer that data has been written */ #define SCIF_CLIENT_RCVD 17 /* Notify the peer that data has been read */ #define SCIF_GET_NODE_INFO 18 /* Get current node mask from the mgmt node*/ -#define SCIF_MAX_MSG SCIF_GET_NODE_INFO +#define SCIF_REGISTER 19 /* Tell peer about a new registered window */ +#define SCIF_REGISTER_ACK 20 /* Notify peer about unregistration success */ +#define SCIF_REGISTER_NACK 21 /* Notify peer about registration success */ +#define SCIF_UNREGISTER 22 /* Tell peer about unregistering a window */ +#define SCIF_UNREGISTER_ACK 23 /* Notify peer about registration failure */ +#define SCIF_UNREGISTER_NACK 24 /* Notify peer about unregistration failure */ +#define SCIF_ALLOC_REQ 25 /* Request a mapped buffer */ +#define SCIF_ALLOC_GNT 26 /* Notify peer about allocation success */ +#define SCIF_ALLOC_REJ 27 /* Notify peer about allocation failure */ +#define SCIF_FREE_VIRT 28 /* Free previously allocated virtual memory */ +#define SCIF_MUNMAP 29 /* Acknowledgment for a SCIF_MMAP request */ +#define SCIF_MARK 30 /* SCIF Remote Fence Mark Request */ +#define SCIF_MARK_ACK 31 /* SCIF Remote Fence Mark Success */ +#define SCIF_MARK_NACK 32 /* SCIF Remote Fence Mark Failure */ +#define SCIF_WAIT 33 /* SCIF Remote Fence Wait Request */ +#define SCIF_WAIT_ACK 34 /* SCIF Remote Fence Wait Success */ +#define SCIF_WAIT_NACK 35 /* SCIF Remote Fence Wait Failure */ +#define SCIF_SIG_LOCAL 36 /* SCIF Remote Fence Local Signal Request */ +#define SCIF_SIG_REMOTE 37 /* SCIF Remote Fence Remote Signal Request */ +#define SCIF_SIG_ACK 38 /* SCIF Remote Fence Remote Signal Success */ +#define SCIF_SIG_NACK 39 /* SCIF Remote Fence Remote Signal Failure */ +#define SCIF_MAX_MSG SCIF_SIG_NACK /* * struct scifmsg - Node QP message format @@ -92,6 +113,24 @@ struct scifmsg { } __packed; /* + * struct scif_allocmsg - Used with SCIF_ALLOC_REQ to request + * the remote note to allocate memory + * + * phys_addr: Physical address of the buffer + * vaddr: Virtual address of the buffer + * size: Size of the buffer + * state: Current state + * allocwq: wait queue for status + */ +struct scif_allocmsg { + dma_addr_t phys_addr; + unsigned long vaddr; + size_t size; + enum scif_msg_state state; + wait_queue_head_t allocwq; +}; + +/* * struct scif_qp - Node Queue Pair * * Interesting structure -- a little difficult because we can only @@ -158,7 +197,6 @@ int scif_setup_qp_connect_response(struct scif_dev *scifdev, int scif_setup_loopback_qp(struct scif_dev *scifdev); int scif_destroy_loopback_qp(struct scif_dev *scifdev); void scif_poll_qp_state(struct work_struct *work); -void scif_qp_response_ack(struct work_struct *work); void scif_destroy_p2p(struct scif_dev *scifdev); void scif_send_exit(struct scif_dev *scifdev); static inline struct device *scif_get_peer_dev(struct scif_dev *scifdev) diff --git a/drivers/misc/mic/scif/scif_peer_bus.c b/drivers/misc/mic/scif/scif_peer_bus.c index 589ae9ad2..6ffa3bdbd 100644 --- a/drivers/misc/mic/scif/scif_peer_bus.c +++ b/drivers/misc/mic/scif/scif_peer_bus.c @@ -24,93 +24,152 @@ dev_to_scif_peer(struct device *dev) return container_of(dev, struct scif_peer_dev, dev); } -static inline struct scif_peer_driver * -drv_to_scif_peer(struct device_driver *drv) -{ - return container_of(drv, struct scif_peer_driver, driver); -} +struct bus_type scif_peer_bus = { + .name = "scif_peer_bus", +}; -static int scif_peer_dev_match(struct device *dv, struct device_driver *dr) +static void scif_peer_release_dev(struct device *d) { - return !strncmp(dev_name(dv), dr->name, 4); + struct scif_peer_dev *sdev = dev_to_scif_peer(d); + struct scif_dev *scifdev = &scif_dev[sdev->dnode]; + + scif_cleanup_scifdev(scifdev); + kfree(sdev); } -static int scif_peer_dev_probe(struct device *d) +static int scif_peer_initialize_device(struct scif_dev *scifdev) { - struct scif_peer_dev *dev = dev_to_scif_peer(d); - struct scif_peer_driver *drv = drv_to_scif_peer(dev->dev.driver); + struct scif_peer_dev *spdev; + int ret; - return drv->probe(dev); -} + spdev = kzalloc(sizeof(*spdev), GFP_KERNEL); + if (!spdev) { + ret = -ENOMEM; + goto err; + } -static int scif_peer_dev_remove(struct device *d) -{ - struct scif_peer_dev *dev = dev_to_scif_peer(d); - struct scif_peer_driver *drv = drv_to_scif_peer(dev->dev.driver); + spdev->dev.parent = scifdev->sdev->dev.parent; + spdev->dev.release = scif_peer_release_dev; + spdev->dnode = scifdev->node; + spdev->dev.bus = &scif_peer_bus; + dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode); + + device_initialize(&spdev->dev); + get_device(&spdev->dev); + rcu_assign_pointer(scifdev->spdev, spdev); - drv->remove(dev); + mutex_lock(&scif_info.conflock); + scif_info.total++; + scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid); + mutex_unlock(&scif_info.conflock); return 0; +err: + dev_err(&scifdev->sdev->dev, + "dnode %d: initialize_device rc %d\n", scifdev->node, ret); + return ret; } -static struct bus_type scif_peer_bus = { - .name = "scif_peer_bus", - .match = scif_peer_dev_match, - .probe = scif_peer_dev_probe, - .remove = scif_peer_dev_remove, -}; - -int scif_peer_register_driver(struct scif_peer_driver *driver) +static int scif_peer_add_device(struct scif_dev *scifdev) { - driver->driver.bus = &scif_peer_bus; - return driver_register(&driver->driver); + struct scif_peer_dev *spdev = rcu_dereference(scifdev->spdev); + char pool_name[16]; + int ret; + + ret = device_add(&spdev->dev); + put_device(&spdev->dev); + if (ret) { + dev_err(&scifdev->sdev->dev, + "dnode %d: peer device_add failed\n", scifdev->node); + goto put_spdev; + } + + scnprintf(pool_name, sizeof(pool_name), "scif-%d", spdev->dnode); + scifdev->signal_pool = dmam_pool_create(pool_name, &scifdev->sdev->dev, + sizeof(struct scif_status), 1, + 0); + if (!scifdev->signal_pool) { + dev_err(&scifdev->sdev->dev, + "dnode %d: dmam_pool_create failed\n", scifdev->node); + ret = -ENOMEM; + goto del_spdev; + } + dev_dbg(&spdev->dev, "Added peer dnode %d\n", spdev->dnode); + return 0; +del_spdev: + device_del(&spdev->dev); +put_spdev: + RCU_INIT_POINTER(scifdev->spdev, NULL); + synchronize_rcu(); + put_device(&spdev->dev); + + mutex_lock(&scif_info.conflock); + scif_info.total--; + mutex_unlock(&scif_info.conflock); + return ret; } -void scif_peer_unregister_driver(struct scif_peer_driver *driver) +void scif_add_peer_device(struct work_struct *work) { - driver_unregister(&driver->driver); + struct scif_dev *scifdev = container_of(work, struct scif_dev, + peer_add_work); + + scif_peer_add_device(scifdev); } -static void scif_peer_release_dev(struct device *d) +/* + * Peer device registration is split into a device_initialize and a device_add. + * The reason for doing this is as follows: First, peer device registration + * itself cannot be done in the message processing thread and must be delegated + * to another workqueue, otherwise if SCIF client probe, called during peer + * device registration, calls scif_connect(..), it will block the message + * processing thread causing a deadlock. Next, device_initialize is done in the + * "top-half" message processing thread and device_add in the "bottom-half" + * workqueue. If this is not done, SCIF_CNCT_REQ message processing executing + * concurrently with SCIF_INIT message processing is unable to get a reference + * on the peer device, thereby failing the connect request. + */ +void scif_peer_register_device(struct scif_dev *scifdev) { - struct scif_peer_dev *sdev = dev_to_scif_peer(d); - struct scif_dev *scifdev = &scif_dev[sdev->dnode]; + int ret; - scif_cleanup_scifdev(scifdev); - kfree(sdev); + mutex_lock(&scifdev->lock); + ret = scif_peer_initialize_device(scifdev); + if (ret) + goto exit; + schedule_work(&scifdev->peer_add_work); +exit: + mutex_unlock(&scifdev->lock); } -struct scif_peer_dev * -scif_peer_register_device(struct scif_dev *scifdev) +int scif_peer_unregister_device(struct scif_dev *scifdev) { - int ret; struct scif_peer_dev *spdev; - spdev = kzalloc(sizeof(*spdev), GFP_KERNEL); - if (!spdev) - return ERR_PTR(-ENOMEM); - - spdev->dev.parent = scifdev->sdev->dev.parent; - spdev->dev.release = scif_peer_release_dev; - spdev->dnode = scifdev->node; - spdev->dev.bus = &scif_peer_bus; + mutex_lock(&scifdev->lock); + /* Flush work to ensure device register is complete */ + flush_work(&scifdev->peer_add_work); - dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode); /* - * device_register() causes the bus infrastructure to look for a - * matching driver. + * Continue holding scifdev->lock since theoretically unregister_device + * can be called simultaneously from multiple threads */ - ret = device_register(&spdev->dev); - if (ret) - goto free_spdev; - return spdev; -free_spdev: - kfree(spdev); - return ERR_PTR(ret); -} - -void scif_peer_unregister_device(struct scif_peer_dev *sdev) -{ - device_unregister(&sdev->dev); + spdev = rcu_dereference(scifdev->spdev); + if (!spdev) { + mutex_unlock(&scifdev->lock); + return -ENODEV; + } + + RCU_INIT_POINTER(scifdev->spdev, NULL); + synchronize_rcu(); + mutex_unlock(&scifdev->lock); + + dev_dbg(&spdev->dev, "Removing peer dnode %d\n", spdev->dnode); + device_unregister(&spdev->dev); + + mutex_lock(&scif_info.conflock); + scif_info.total--; + mutex_unlock(&scif_info.conflock); + return 0; } int scif_peer_bus_init(void) diff --git a/drivers/misc/mic/scif/scif_peer_bus.h b/drivers/misc/mic/scif/scif_peer_bus.h index 33f0dbb30..a3b8dd2ed 100644 --- a/drivers/misc/mic/scif/scif_peer_bus.h +++ b/drivers/misc/mic/scif/scif_peer_bus.h @@ -19,47 +19,13 @@ #include <linux/device.h> #include <linux/mic_common.h> - -/* - * Peer devices show up as PCIe devices for the mgmt node but not the cards. - * The mgmt node discovers all the cards on the PCIe bus and informs the other - * cards about their peers. Upon notification of a peer a node adds a peer - * device to the peer bus to maintain symmetry in the way devices are - * discovered across all nodes in the SCIF network. - */ -/** - * scif_peer_dev - representation of a peer SCIF device - * @dev: underlying device - * @dnode - The destination node which this device will communicate with. - */ -struct scif_peer_dev { - struct device dev; - u8 dnode; -}; - -/** - * scif_peer_driver - operations for a scif_peer I/O driver - * @driver: underlying device driver (populate name and owner). - * @id_table: the ids serviced by this driver. - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. - */ -struct scif_peer_driver { - struct device_driver driver; - const struct scif_peer_dev_id *id_table; - - int (*probe)(struct scif_peer_dev *dev); - void (*remove)(struct scif_peer_dev *dev); -}; +#include <linux/scif.h> struct scif_dev; -int scif_peer_register_driver(struct scif_peer_driver *driver); -void scif_peer_unregister_driver(struct scif_peer_driver *driver); - -struct scif_peer_dev *scif_peer_register_device(struct scif_dev *sdev); -void scif_peer_unregister_device(struct scif_peer_dev *sdev); - +void scif_add_peer_device(struct work_struct *work); +void scif_peer_register_device(struct scif_dev *sdev); +int scif_peer_unregister_device(struct scif_dev *scifdev); int scif_peer_bus_init(void); void scif_peer_bus_exit(void); #endif /* _SCIF_PEER_BUS_H */ diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c new file mode 100644 index 000000000..8310b4dbf --- /dev/null +++ b/drivers/misc/mic/scif/scif_rma.c @@ -0,0 +1,1775 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include <linux/dma_remapping.h> +#include <linux/pagemap.h> +#include "scif_main.h" +#include "scif_map.h" + +/* Used to skip ulimit checks for registrations with SCIF_MAP_KERNEL flag */ +#define SCIF_MAP_ULIMIT 0x40 + +bool scif_ulimit_check = 1; + +/** + * scif_rma_ep_init: + * @ep: end point + * + * Initialize RMA per EP data structures. + */ +void scif_rma_ep_init(struct scif_endpt *ep) +{ + struct scif_endpt_rma_info *rma = &ep->rma_info; + + mutex_init(&rma->rma_lock); + init_iova_domain(&rma->iovad, PAGE_SIZE, SCIF_IOVA_START_PFN, + SCIF_DMA_64BIT_PFN); + spin_lock_init(&rma->tc_lock); + mutex_init(&rma->mmn_lock); + INIT_LIST_HEAD(&rma->reg_list); + INIT_LIST_HEAD(&rma->remote_reg_list); + atomic_set(&rma->tw_refcount, 0); + atomic_set(&rma->tcw_refcount, 0); + atomic_set(&rma->tcw_total_pages, 0); + atomic_set(&rma->fence_refcount, 0); + + rma->async_list_del = 0; + rma->dma_chan = NULL; + INIT_LIST_HEAD(&rma->mmn_list); + INIT_LIST_HEAD(&rma->vma_list); + init_waitqueue_head(&rma->markwq); +} + +/** + * scif_rma_ep_can_uninit: + * @ep: end point + * + * Returns 1 if an endpoint can be uninitialized and 0 otherwise. + */ +int scif_rma_ep_can_uninit(struct scif_endpt *ep) +{ + int ret = 0; + + mutex_lock(&ep->rma_info.rma_lock); + /* Destroy RMA Info only if both lists are empty */ + if (list_empty(&ep->rma_info.reg_list) && + list_empty(&ep->rma_info.remote_reg_list) && + list_empty(&ep->rma_info.mmn_list) && + !atomic_read(&ep->rma_info.tw_refcount) && + !atomic_read(&ep->rma_info.tcw_refcount) && + !atomic_read(&ep->rma_info.fence_refcount)) + ret = 1; + mutex_unlock(&ep->rma_info.rma_lock); + return ret; +} + +/** + * scif_create_pinned_pages: + * @nr_pages: number of pages in window + * @prot: read/write protection + * + * Allocate and prepare a set of pinned pages. + */ +static struct scif_pinned_pages * +scif_create_pinned_pages(int nr_pages, int prot) +{ + struct scif_pinned_pages *pin; + + might_sleep(); + pin = scif_zalloc(sizeof(*pin)); + if (!pin) + goto error; + + pin->pages = scif_zalloc(nr_pages * sizeof(*pin->pages)); + if (!pin->pages) + goto error_free_pinned_pages; + + pin->prot = prot; + pin->magic = SCIFEP_MAGIC; + return pin; + +error_free_pinned_pages: + scif_free(pin, sizeof(*pin)); +error: + return NULL; +} + +/** + * scif_destroy_pinned_pages: + * @pin: A set of pinned pages. + * + * Deallocate resources for pinned pages. + */ +static int scif_destroy_pinned_pages(struct scif_pinned_pages *pin) +{ + int j; + int writeable = pin->prot & SCIF_PROT_WRITE; + int kernel = SCIF_MAP_KERNEL & pin->map_flags; + + for (j = 0; j < pin->nr_pages; j++) { + if (pin->pages[j] && !kernel) { + if (writeable) + SetPageDirty(pin->pages[j]); + put_page(pin->pages[j]); + } + } + + scif_free(pin->pages, + pin->nr_pages * sizeof(*pin->pages)); + scif_free(pin, sizeof(*pin)); + return 0; +} + +/* + * scif_create_window: + * @ep: end point + * @nr_pages: number of pages + * @offset: registration offset + * @temp: true if a temporary window is being created + * + * Allocate and prepare a self registration window. + */ +struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, + s64 offset, bool temp) +{ + struct scif_window *window; + + might_sleep(); + window = scif_zalloc(sizeof(*window)); + if (!window) + goto error; + + window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); + if (!window->dma_addr) + goto error_free_window; + + window->num_pages = scif_zalloc(nr_pages * sizeof(*window->num_pages)); + if (!window->num_pages) + goto error_free_window; + + window->offset = offset; + window->ep = (u64)ep; + window->magic = SCIFEP_MAGIC; + window->reg_state = OP_IDLE; + init_waitqueue_head(&window->regwq); + window->unreg_state = OP_IDLE; + init_waitqueue_head(&window->unregwq); + INIT_LIST_HEAD(&window->list); + window->type = SCIF_WINDOW_SELF; + window->temp = temp; + return window; + +error_free_window: + scif_free(window->dma_addr, + nr_pages * sizeof(*window->dma_addr)); + scif_free(window, sizeof(*window)); +error: + return NULL; +} + +/** + * scif_destroy_incomplete_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. + */ +static void scif_destroy_incomplete_window(struct scif_endpt *ep, + struct scif_window *window) +{ + int err; + int nr_pages = window->nr_pages; + struct scif_allocmsg *alloc = &window->alloc_handle; + struct scifmsg msg; + +retry: + /* Wait for a SCIF_ALLOC_GNT/REJ message */ + err = wait_event_timeout(alloc->allocwq, + alloc->state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + + mutex_lock(&ep->rma_info.rma_lock); + if (alloc->state == OP_COMPLETED) { + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + msg.payload[3] = SCIF_REGISTER; + _scif_nodeqp_send(ep->remote_dev, &msg); + } + mutex_unlock(&ep->rma_info.rma_lock); + + scif_free_window_offset(ep, window, window->offset); + scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); + scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); + scif_free(window, sizeof(*window)); +} + +/** + * scif_unmap_window: + * @remote_dev: SCIF remote device + * @window: registration window + * + * Delete any DMA mappings created for a registered self window + */ +void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window) +{ + int j; + + if (scif_is_iommu_enabled() && !scifdev_self(remote_dev)) { + if (window->st) { + dma_unmap_sg(&remote_dev->sdev->dev, + window->st->sgl, window->st->nents, + DMA_BIDIRECTIONAL); + sg_free_table(window->st); + kfree(window->st); + window->st = NULL; + } + } else { + for (j = 0; j < window->nr_contig_chunks; j++) { + if (window->dma_addr[j]) { + scif_unmap_single(window->dma_addr[j], + remote_dev, + window->num_pages[j] << + PAGE_SHIFT); + window->dma_addr[j] = 0x0; + } + } + } +} + +static inline struct mm_struct *__scif_acquire_mm(void) +{ + if (scif_ulimit_check) + return get_task_mm(current); + return NULL; +} + +static inline void __scif_release_mm(struct mm_struct *mm) +{ + if (mm) + mmput(mm); +} + +static inline int +__scif_dec_pinned_vm_lock(struct mm_struct *mm, + int nr_pages, bool try_lock) +{ + if (!mm || !nr_pages || !scif_ulimit_check) + return 0; + if (try_lock) { + if (!down_write_trylock(&mm->mmap_sem)) { + dev_err(scif_info.mdev.this_device, + "%s %d err\n", __func__, __LINE__); + return -1; + } + } else { + down_write(&mm->mmap_sem); + } + mm->pinned_vm -= nr_pages; + up_write(&mm->mmap_sem); + return 0; +} + +static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, + int nr_pages) +{ + unsigned long locked, lock_limit; + + if (!mm || !nr_pages || !scif_ulimit_check) + return 0; + + locked = nr_pages; + locked += mm->pinned_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + dev_err(scif_info.mdev.this_device, + "locked(%lu) > lock_limit(%lu)\n", + locked, lock_limit); + return -ENOMEM; + } + mm->pinned_vm = locked; + return 0; +} + +/** + * scif_destroy_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. + */ +int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window) +{ + int j; + struct scif_pinned_pages *pinned_pages = window->pinned_pages; + int nr_pages = window->nr_pages; + + might_sleep(); + if (!window->temp && window->mm) { + __scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0); + __scif_release_mm(window->mm); + window->mm = NULL; + } + + scif_free_window_offset(ep, window, window->offset); + scif_unmap_window(ep->remote_dev, window); + /* + * Decrement references for this set of pinned pages from + * this window. + */ + j = atomic_sub_return(1, &pinned_pages->ref_count); + if (j < 0) + dev_err(scif_info.mdev.this_device, + "%s %d incorrect ref count %d\n", + __func__, __LINE__, j); + /* + * If the ref count for pinned_pages is zero then someone + * has already called scif_unpin_pages() for it and we should + * destroy the page cache. + */ + if (!j) + scif_destroy_pinned_pages(window->pinned_pages); + scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); + scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); + window->magic = 0; + scif_free(window, sizeof(*window)); + return 0; +} + +/** + * scif_create_remote_lookup: + * @remote_dev: SCIF remote device + * @window: remote window + * + * Allocate and prepare lookup entries for the remote + * end to copy over the physical addresses. + * Returns 0 on success and appropriate errno on failure. + */ +static int scif_create_remote_lookup(struct scif_dev *remote_dev, + struct scif_window *window) +{ + int i, j, err = 0; + int nr_pages = window->nr_pages; + bool vmalloc_dma_phys, vmalloc_num_pages; + + might_sleep(); + /* Map window */ + err = scif_map_single(&window->mapped_offset, + window, remote_dev, sizeof(*window)); + if (err) + goto error_window; + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE, + ((2) * 1024 * 1024)) >> 21; + + window->dma_addr_lookup.lookup = + scif_alloc_coherent(&window->dma_addr_lookup.offset, + remote_dev, window->nr_lookup * + sizeof(*window->dma_addr_lookup.lookup), + GFP_KERNEL | __GFP_ZERO); + if (!window->dma_addr_lookup.lookup) { + err = -ENOMEM; + goto error_window; + } + + window->num_pages_lookup.lookup = + scif_alloc_coherent(&window->num_pages_lookup.offset, + remote_dev, window->nr_lookup * + sizeof(*window->num_pages_lookup.lookup), + GFP_KERNEL | __GFP_ZERO); + if (!window->num_pages_lookup.lookup) { + err = -ENOMEM; + goto error_window; + } + + vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]); + vmalloc_num_pages = is_vmalloc_addr(&window->num_pages[0]); + + /* Now map each of the pages containing physical addresses */ + for (i = 0, j = 0; i < nr_pages; i += SCIF_NR_ADDR_IN_PAGE, j++) { + err = scif_map_page(&window->dma_addr_lookup.lookup[j], + vmalloc_dma_phys ? + vmalloc_to_page(&window->dma_addr[i]) : + virt_to_page(&window->dma_addr[i]), + remote_dev); + if (err) + goto error_window; + err = scif_map_page(&window->num_pages_lookup.lookup[j], + vmalloc_dma_phys ? + vmalloc_to_page(&window->num_pages[i]) : + virt_to_page(&window->num_pages[i]), + remote_dev); + if (err) + goto error_window; + } + return 0; +error_window: + return err; +} + +/** + * scif_destroy_remote_lookup: + * @remote_dev: SCIF remote device + * @window: remote window + * + * Destroy lookup entries used for the remote + * end to copy over the physical addresses. + */ +static void scif_destroy_remote_lookup(struct scif_dev *remote_dev, + struct scif_window *window) +{ + int i, j; + + if (window->nr_lookup) { + struct scif_rma_lookup *lup = &window->dma_addr_lookup; + struct scif_rma_lookup *npup = &window->num_pages_lookup; + + for (i = 0, j = 0; i < window->nr_pages; + i += SCIF_NR_ADDR_IN_PAGE, j++) { + if (lup->lookup && lup->lookup[j]) + scif_unmap_single(lup->lookup[j], + remote_dev, + PAGE_SIZE); + if (npup->lookup && npup->lookup[j]) + scif_unmap_single(npup->lookup[j], + remote_dev, + PAGE_SIZE); + } + if (lup->lookup) + scif_free_coherent(lup->lookup, lup->offset, + remote_dev, window->nr_lookup * + sizeof(*lup->lookup)); + if (npup->lookup) + scif_free_coherent(npup->lookup, npup->offset, + remote_dev, window->nr_lookup * + sizeof(*npup->lookup)); + if (window->mapped_offset) + scif_unmap_single(window->mapped_offset, + remote_dev, sizeof(*window)); + window->nr_lookup = 0; + } +} + +/** + * scif_create_remote_window: + * @ep: end point + * @nr_pages: number of pages in window + * + * Allocate and prepare a remote registration window. + */ +static struct scif_window * +scif_create_remote_window(struct scif_dev *scifdev, int nr_pages) +{ + struct scif_window *window; + + might_sleep(); + window = scif_zalloc(sizeof(*window)); + if (!window) + goto error_ret; + + window->magic = SCIFEP_MAGIC; + window->nr_pages = nr_pages; + + window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); + if (!window->dma_addr) + goto error_window; + + window->num_pages = scif_zalloc(nr_pages * + sizeof(*window->num_pages)); + if (!window->num_pages) + goto error_window; + + if (scif_create_remote_lookup(scifdev, window)) + goto error_window; + + window->type = SCIF_WINDOW_PEER; + window->unreg_state = OP_IDLE; + INIT_LIST_HEAD(&window->list); + return window; +error_window: + scif_destroy_remote_window(window); +error_ret: + return NULL; +} + +/** + * scif_destroy_remote_window: + * @ep: end point + * @window: remote registration window + * + * Deallocate resources for remote window. + */ +void +scif_destroy_remote_window(struct scif_window *window) +{ + scif_free(window->dma_addr, window->nr_pages * + sizeof(*window->dma_addr)); + scif_free(window->num_pages, window->nr_pages * + sizeof(*window->num_pages)); + window->magic = 0; + scif_free(window, sizeof(*window)); +} + +/** + * scif_iommu_map: create DMA mappings if the IOMMU is enabled + * @remote_dev: SCIF remote device + * @window: remote registration window + * + * Map the physical pages using dma_map_sg(..) and then detect the number + * of contiguous DMA mappings allocated + */ +static int scif_iommu_map(struct scif_dev *remote_dev, + struct scif_window *window) +{ + struct scatterlist *sg; + int i, err; + scif_pinned_pages_t pin = window->pinned_pages; + + window->st = kzalloc(sizeof(*window->st), GFP_KERNEL); + if (!window->st) + return -ENOMEM; + + err = sg_alloc_table(window->st, window->nr_pages, GFP_KERNEL); + if (err) + return err; + + for_each_sg(window->st->sgl, sg, window->st->nents, i) + sg_set_page(sg, pin->pages[i], PAGE_SIZE, 0x0); + + err = dma_map_sg(&remote_dev->sdev->dev, window->st->sgl, + window->st->nents, DMA_BIDIRECTIONAL); + if (!err) + return -ENOMEM; + /* Detect contiguous ranges of DMA mappings */ + sg = window->st->sgl; + for (i = 0; sg; i++) { + dma_addr_t last_da; + + window->dma_addr[i] = sg_dma_address(sg); + window->num_pages[i] = sg_dma_len(sg) >> PAGE_SHIFT; + last_da = sg_dma_address(sg) + sg_dma_len(sg); + while ((sg = sg_next(sg)) && sg_dma_address(sg) == last_da) { + window->num_pages[i] += + (sg_dma_len(sg) >> PAGE_SHIFT); + last_da = window->dma_addr[i] + + sg_dma_len(sg); + } + window->nr_contig_chunks++; + } + return 0; +} + +/** + * scif_map_window: + * @remote_dev: SCIF remote device + * @window: self registration window + * + * Map pages of a window into the aperture/PCI. + * Also determine addresses required for DMA. + */ +int +scif_map_window(struct scif_dev *remote_dev, struct scif_window *window) +{ + int i, j, k, err = 0, nr_contig_pages; + scif_pinned_pages_t pin; + phys_addr_t phys_prev, phys_curr; + + might_sleep(); + + pin = window->pinned_pages; + + if (intel_iommu_enabled && !scifdev_self(remote_dev)) + return scif_iommu_map(remote_dev, window); + + for (i = 0, j = 0; i < window->nr_pages; i += nr_contig_pages, j++) { + phys_prev = page_to_phys(pin->pages[i]); + nr_contig_pages = 1; + + /* Detect physically contiguous chunks */ + for (k = i + 1; k < window->nr_pages; k++) { + phys_curr = page_to_phys(pin->pages[k]); + if (phys_curr != (phys_prev + PAGE_SIZE)) + break; + phys_prev = phys_curr; + nr_contig_pages++; + } + window->num_pages[j] = nr_contig_pages; + window->nr_contig_chunks++; + if (scif_is_mgmt_node()) { + /* + * Management node has to deal with SMPT on X100 and + * hence the DMA mapping is required + */ + err = scif_map_single(&window->dma_addr[j], + phys_to_virt(page_to_phys( + pin->pages[i])), + remote_dev, + nr_contig_pages << PAGE_SHIFT); + if (err) + return err; + } else { + window->dma_addr[j] = page_to_phys(pin->pages[i]); + } + } + return err; +} + +/** + * scif_send_scif_unregister: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_UNREGISTER message. + */ +static int scif_send_scif_unregister(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + + msg.uop = SCIF_UNREGISTER; + msg.src = ep->port; + msg.payload[0] = window->alloc_handle.vaddr; + msg.payload[1] = (u64)window; + return scif_nodeqp_send(ep->remote_dev, &msg); +} + +/** + * scif_unregister_window: + * @window: self registration window + * + * Send an unregistration request and wait for a response. + */ +int scif_unregister_window(struct scif_window *window) +{ + int err = 0; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + bool send_msg = false; + + might_sleep(); + switch (window->unreg_state) { + case OP_IDLE: + { + window->unreg_state = OP_IN_PROGRESS; + send_msg = true; + /* fall through */ + } + case OP_IN_PROGRESS: + { + scif_get_window(window, 1); + mutex_unlock(&ep->rma_info.rma_lock); + if (send_msg) { + err = scif_send_scif_unregister(ep, window); + if (err) { + window->unreg_state = OP_COMPLETED; + goto done; + } + } else { + /* Return ENXIO since unregistration is in progress */ + mutex_lock(&ep->rma_info.rma_lock); + return -ENXIO; + } +retry: + /* Wait for a SCIF_UNREGISTER_(N)ACK message */ + err = wait_event_timeout(window->unregwq, + window->unreg_state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + } + if (err > 0) + err = 0; +done: + mutex_lock(&ep->rma_info.rma_lock); + scif_put_window(window, 1); + break; + } + case OP_FAILED: + { + if (!scifdev_alive(ep)) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + } + break; + } + case OP_COMPLETED: + break; + default: + err = -ENODEV; + } + + if (window->unreg_state == OP_COMPLETED && window->ref_count) + scif_put_window(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + list_del_init(&window->list); + scif_free_window_offset(ep, window, window->offset); + mutex_unlock(&ep->rma_info.rma_lock); + if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL)) && + scifdev_alive(ep)) { + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + } else { + if (!__scif_dec_pinned_vm_lock(window->mm, + window->nr_pages, 1)) { + __scif_release_mm(window->mm); + window->mm = NULL; + } + } + scif_queue_for_cleanup(window, &scif_info.rma); + mutex_lock(&ep->rma_info.rma_lock); + } + return err; +} + +/** + * scif_send_alloc_request: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request + */ +static int scif_send_alloc_request(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + struct scif_allocmsg *alloc = &window->alloc_handle; + + /* Set up the Alloc Handle */ + alloc->state = OP_IN_PROGRESS; + init_waitqueue_head(&alloc->allocwq); + + /* Send out an allocation request */ + msg.uop = SCIF_ALLOC_REQ; + msg.payload[1] = window->nr_pages; + msg.payload[2] = (u64)&window->alloc_handle; + return _scif_nodeqp_send(ep->remote_dev, &msg); +} + +/** + * scif_prep_remote_window: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request, wait for an allocation response, + * and prepares the remote window by copying over the page lists + */ +static int scif_prep_remote_window(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + struct scif_window *remote_window; + struct scif_allocmsg *alloc = &window->alloc_handle; + dma_addr_t *dma_phys_lookup, *tmp, *num_pages_lookup, *tmp1; + int i = 0, j = 0; + int nr_contig_chunks, loop_nr_contig_chunks; + int remaining_nr_contig_chunks, nr_lookup; + int err, map_err; + + map_err = scif_map_window(ep->remote_dev, window); + if (map_err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d map_err %d\n", __func__, __LINE__, map_err); + remaining_nr_contig_chunks = window->nr_contig_chunks; + nr_contig_chunks = window->nr_contig_chunks; +retry: + /* Wait for a SCIF_ALLOC_GNT/REJ message */ + err = wait_event_timeout(alloc->allocwq, + alloc->state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + mutex_lock(&ep->rma_info.rma_lock); + /* Synchronize with the thread waking up allocwq */ + mutex_unlock(&ep->rma_info.rma_lock); + if (!err && scifdev_alive(ep)) + goto retry; + + if (!err) + err = -ENODEV; + + if (err > 0) + err = 0; + else + return err; + + /* Bail out. The remote end rejected this request */ + if (alloc->state == OP_FAILED) + return -ENOMEM; + + if (map_err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, map_err); + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + msg.payload[3] = SCIF_REGISTER; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = _scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + return err; + } + + remote_window = scif_ioremap(alloc->phys_addr, sizeof(*window), + ep->remote_dev); + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + nr_lookup = ALIGN(nr_contig_chunks, SCIF_NR_ADDR_IN_PAGE) + >> ilog2(SCIF_NR_ADDR_IN_PAGE); + + dma_phys_lookup = + scif_ioremap(remote_window->dma_addr_lookup.offset, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + num_pages_lookup = + scif_ioremap(remote_window->num_pages_lookup.offset, + nr_lookup * + sizeof(*remote_window->num_pages_lookup.lookup), + ep->remote_dev); + + while (remaining_nr_contig_chunks) { + loop_nr_contig_chunks = min_t(int, remaining_nr_contig_chunks, + (int)SCIF_NR_ADDR_IN_PAGE); + /* #1/2 - Copy physical addresses over to the remote side */ + + /* #2/2 - Copy DMA addresses (addresses that are fed into the + * DMA engine) We transfer bus addresses which are then + * converted into a MIC physical address on the remote + * side if it is a MIC, if the remote node is a mgmt node we + * transfer the MIC physical address + */ + tmp = scif_ioremap(dma_phys_lookup[j], + loop_nr_contig_chunks * + sizeof(*window->dma_addr), + ep->remote_dev); + tmp1 = scif_ioremap(num_pages_lookup[j], + loop_nr_contig_chunks * + sizeof(*window->num_pages), + ep->remote_dev); + if (scif_is_mgmt_node()) { + memcpy_toio((void __force __iomem *)tmp, + &window->dma_addr[i], loop_nr_contig_chunks + * sizeof(*window->dma_addr)); + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], loop_nr_contig_chunks + * sizeof(*window->num_pages)); + } else { + if (scifdev_is_p2p(ep->remote_dev)) { + /* + * add remote node's base address for this node + * to convert it into a MIC address + */ + int m; + dma_addr_t dma_addr; + + for (m = 0; m < loop_nr_contig_chunks; m++) { + dma_addr = window->dma_addr[i + m] + + ep->remote_dev->base_addr; + writeq(dma_addr, + (void __force __iomem *)&tmp[m]); + } + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], + loop_nr_contig_chunks + * sizeof(*window->num_pages)); + } else { + /* Mgmt node or loopback - transfer DMA + * addresses as is, this is the same as a + * MIC physical address (we use the dma_addr + * and not the phys_addr array since the + * phys_addr is only setup if there is a mmap() + * request from the mgmt node) + */ + memcpy_toio((void __force __iomem *)tmp, + &window->dma_addr[i], + loop_nr_contig_chunks * + sizeof(*window->dma_addr)); + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], + loop_nr_contig_chunks * + sizeof(*window->num_pages)); + } + } + remaining_nr_contig_chunks -= loop_nr_contig_chunks; + i += loop_nr_contig_chunks; + j++; + scif_iounmap(tmp, loop_nr_contig_chunks * + sizeof(*window->dma_addr), ep->remote_dev); + scif_iounmap(tmp1, loop_nr_contig_chunks * + sizeof(*window->num_pages), ep->remote_dev); + } + + /* Prepare the remote window for the peer */ + remote_window->peer_window = (u64)window; + remote_window->offset = window->offset; + remote_window->prot = window->prot; + remote_window->nr_contig_chunks = nr_contig_chunks; + remote_window->ep = ep->remote_ep; + scif_iounmap(num_pages_lookup, + nr_lookup * + sizeof(*remote_window->num_pages_lookup.lookup), + ep->remote_dev); + scif_iounmap(dma_phys_lookup, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev); + window->peer_window = alloc->vaddr; + return err; +} + +/** + * scif_send_scif_register: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_REGISTER message if EP is connected and wait for a + * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT + * message so that the peer can free its remote window allocated earlier. + */ +static int scif_send_scif_register(struct scif_endpt *ep, + struct scif_window *window) +{ + int err = 0; + struct scifmsg msg; + + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) { + msg.uop = SCIF_REGISTER; + window->reg_state = OP_IN_PROGRESS; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + spin_unlock(&ep->lock); + if (!err) { +retry: + /* Wait for a SCIF_REGISTER_(N)ACK message */ + err = wait_event_timeout(window->regwq, + window->reg_state != + OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + err = !err ? -ENODEV : 0; + if (window->reg_state == OP_FAILED) + err = -ENOTCONN; + } + } else { + msg.uop = SCIF_FREE_VIRT; + msg.payload[3] = SCIF_REGISTER; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + spin_unlock(&ep->lock); + if (!err) + err = -ENOTCONN; + } + return err; +} + +/** + * scif_get_window_offset: + * @ep: end point descriptor + * @flags: flags + * @offset: offset hint + * @num_pages: number of pages + * @out_offset: computed offset returned by reference. + * + * Compute/Claim a new offset for this EP. + */ +int scif_get_window_offset(struct scif_endpt *ep, int flags, s64 offset, + int num_pages, s64 *out_offset) +{ + s64 page_index; + struct iova *iova_ptr; + int err = 0; + + if (flags & SCIF_MAP_FIXED) { + page_index = SCIF_IOVA_PFN(offset); + iova_ptr = reserve_iova(&ep->rma_info.iovad, page_index, + page_index + num_pages - 1); + if (!iova_ptr) + err = -EADDRINUSE; + } else { + iova_ptr = alloc_iova(&ep->rma_info.iovad, num_pages, + SCIF_DMA_63BIT_PFN - 1, 0); + if (!iova_ptr) + err = -ENOMEM; + } + if (!err) + *out_offset = (iova_ptr->pfn_lo) << PAGE_SHIFT; + return err; +} + +/** + * scif_free_window_offset: + * @ep: end point descriptor + * @window: registration window + * @offset: Offset to be freed + * + * Free offset for this EP. The callee is supposed to grab + * the RMA mutex before calling this API. + */ +void scif_free_window_offset(struct scif_endpt *ep, + struct scif_window *window, s64 offset) +{ + if ((window && !window->offset_freed) || !window) { + free_iova(&ep->rma_info.iovad, offset >> PAGE_SHIFT); + if (window) + window->offset_freed = true; + } +} + +/** + * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message + * @msg: Interrupt message + * + * Remote side is requesting a memory allocation. + */ +void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg) +{ + int err; + struct scif_window *window = NULL; + int nr_pages = msg->payload[1]; + + window = scif_create_remote_window(scifdev, nr_pages); + if (!window) { + err = -ENOMEM; + goto error; + } + + /* The peer's allocation request is granted */ + msg->uop = SCIF_ALLOC_GNT; + msg->payload[0] = (u64)window; + msg->payload[1] = window->mapped_offset; + err = scif_nodeqp_send(scifdev, msg); + if (err) + scif_destroy_remote_window(window); + return; +error: + /* The peer's allocation request is rejected */ + dev_err(&scifdev->sdev->dev, + "%s %d error %d alloc_ptr %p nr_pages 0x%x\n", + __func__, __LINE__, err, window, nr_pages); + msg->uop = SCIF_ALLOC_REJ; + scif_nodeqp_send(scifdev, msg); +} + +/** + * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message + * @msg: Interrupt message + * + * Remote side responded to a memory allocation. + */ +void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_allocmsg *handle = (struct scif_allocmsg *)msg->payload[2]; + struct scif_window *window = container_of(handle, struct scif_window, + alloc_handle); + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + handle->vaddr = msg->payload[0]; + handle->phys_addr = msg->payload[1]; + if (msg->uop == SCIF_ALLOC_GNT) + handle->state = OP_COMPLETED; + else + handle->state = OP_FAILED; + wake_up(&handle->allocwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_free_virt: Respond to SCIF_FREE_VIRT interrupt message + * @msg: Interrupt message + * + * Free up memory kmalloc'd earlier. + */ +void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = (struct scif_window *)msg->payload[1]; + + scif_destroy_remote_window(window); +} + +static void +scif_fixup_aper_base(struct scif_dev *dev, struct scif_window *window) +{ + int j; + struct scif_hw_dev *sdev = dev->sdev; + phys_addr_t apt_base = 0; + + /* + * Add the aperture base if the DMA address is not card relative + * since the DMA addresses need to be an offset into the bar + */ + if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && + sdev->aper && !sdev->card_rel_da) + apt_base = sdev->aper->pa; + else + return; + + for (j = 0; j < window->nr_contig_chunks; j++) { + if (window->num_pages[j]) + window->dma_addr[j] += apt_base; + else + break; + } +} + +/** + * scif_recv_reg: Respond to SCIF_REGISTER interrupt message + * @msg: Interrupt message + * + * Update remote window list with a new registered window. + */ +void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) { + msg->uop = SCIF_REGISTER_ACK; + scif_nodeqp_send(ep->remote_dev, msg); + scif_fixup_aper_base(ep->remote_dev, window); + /* No further failures expected. Insert new window */ + scif_insert_window(window, &ep->rma_info.remote_reg_list); + } else { + msg->uop = SCIF_REGISTER_NACK; + scif_nodeqp_send(ep->remote_dev, msg); + } + spin_unlock(&ep->lock); + mutex_unlock(&ep->rma_info.rma_lock); + /* free up any lookup resources now that page lists are transferred */ + scif_destroy_remote_lookup(ep->remote_dev, window); + /* + * We could not insert the window but we need to + * destroy the window. + */ + if (msg->uop == SCIF_REGISTER_NACK) + scif_destroy_remote_window(window); +} + +/** + * scif_recv_unreg: Respond to SCIF_UNREGISTER interrupt message + * @msg: Interrupt message + * + * Remove window from remote registration list; + */ +void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_window *recv_window = + (struct scif_window *)msg->payload[0]; + struct scif_endpt *ep; + int del_window = 0; + + ep = (struct scif_endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = 0; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.remote_reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if (scif_query_window(&req)) { + dev_err(&scifdev->sdev->dev, + "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + if (window) { + if (window->ref_count) + scif_put_window(window, window->nr_pages); + else + dev_err(&scifdev->sdev->dev, + "%s %d ref count should be +ve\n", + __func__, __LINE__); + window->unreg_state = OP_COMPLETED; + if (!window->ref_count) { + msg->uop = SCIF_UNREGISTER_ACK; + atomic_inc(&ep->rma_info.tw_refcount); + ep->rma_info.async_list_del = 1; + list_del_init(&window->list); + del_window = 1; + } else { + /* NACK! There are valid references to this window */ + msg->uop = SCIF_UNREGISTER_NACK; + } + } else { + /* The window did not make its way to the list at all. ACK */ + msg->uop = SCIF_UNREGISTER_ACK; + scif_destroy_remote_window(recv_window); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (del_window) + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + scif_nodeqp_send(ep->remote_dev, msg); + if (del_window) + scif_queue_for_cleanup(window, &scif_info.rma); +} + +/** + * scif_recv_reg_ack: Respond to SCIF_REGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete registration. + */ +void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[2]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->reg_state = OP_COMPLETED; + wake_up(&window->regwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_reg_nack: Respond to SCIF_REGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that registration + * cannot be completed. + */ +void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[2]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->reg_state = OP_FAILED; + wake_up(&window->regwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_unreg_ack: Respond to SCIF_UNREGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete unregistration. + */ +void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->unreg_state = OP_COMPLETED; + wake_up(&window->unregwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_unreg_nack: Respond to SCIF_UNREGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that unregistration + * cannot be completed immediately. + */ +void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->unreg_state = OP_FAILED; + wake_up(&window->unregwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +int __scif_pin_pages(void *addr, size_t len, int *out_prot, + int map_flags, scif_pinned_pages_t *pages) +{ + struct scif_pinned_pages *pinned_pages; + int nr_pages, err = 0, i; + bool vmalloc_addr = false; + bool try_upgrade = false; + int prot = *out_prot; + int ulimit = 0; + struct mm_struct *mm = NULL; + + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT)) + return -EINVAL; + ulimit = !!(map_flags & SCIF_MAP_ULIMIT); + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. len should be non zero */ + if (!len || + (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || + (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) + return -EINVAL; + + might_sleep(); + + nr_pages = len >> PAGE_SHIFT; + + /* Allocate a set of pinned pages */ + pinned_pages = scif_create_pinned_pages(nr_pages, prot); + if (!pinned_pages) + return -ENOMEM; + + if (map_flags & SCIF_MAP_KERNEL) { + if (is_vmalloc_addr(addr)) + vmalloc_addr = true; + + for (i = 0; i < nr_pages; i++) { + if (vmalloc_addr) + pinned_pages->pages[i] = + vmalloc_to_page(addr + (i * PAGE_SIZE)); + else + pinned_pages->pages[i] = + virt_to_page(addr + (i * PAGE_SIZE)); + } + pinned_pages->nr_pages = nr_pages; + pinned_pages->map_flags = SCIF_MAP_KERNEL; + } else { + /* + * SCIF supports registration caching. If a registration has + * been requested with read only permissions, then we try + * to pin the pages with RW permissions so that a subsequent + * transfer with RW permission can hit the cache instead of + * invalidating it. If the upgrade fails with RW then we + * revert back to R permission and retry + */ + if (prot == SCIF_PROT_READ) + try_upgrade = true; + prot |= SCIF_PROT_WRITE; +retry: + mm = current->mm; + down_write(&mm->mmap_sem); + if (ulimit) { + err = __scif_check_inc_pinned_vm(mm, nr_pages); + if (err) { + up_write(&mm->mmap_sem); + pinned_pages->nr_pages = 0; + goto error_unmap; + } + } + + pinned_pages->nr_pages = get_user_pages( + current, + mm, + (u64)addr, + nr_pages, + !!(prot & SCIF_PROT_WRITE), + 0, + pinned_pages->pages, + NULL); + up_write(&mm->mmap_sem); + if (nr_pages != pinned_pages->nr_pages) { + if (try_upgrade) { + if (ulimit) + __scif_dec_pinned_vm_lock(mm, + nr_pages, 0); + /* Roll back any pinned pages */ + for (i = 0; i < pinned_pages->nr_pages; i++) { + if (pinned_pages->pages[i]) + put_page( + pinned_pages->pages[i]); + } + prot &= ~SCIF_PROT_WRITE; + try_upgrade = false; + goto retry; + } + } + pinned_pages->map_flags = 0; + } + + if (pinned_pages->nr_pages < nr_pages) { + err = -EFAULT; + pinned_pages->nr_pages = nr_pages; + goto dec_pinned; + } + + *out_prot = prot; + atomic_set(&pinned_pages->ref_count, 1); + *pages = pinned_pages; + return err; +dec_pinned: + if (ulimit) + __scif_dec_pinned_vm_lock(mm, nr_pages, 0); + /* Something went wrong! Rollback */ +error_unmap: + pinned_pages->nr_pages = nr_pages; + scif_destroy_pinned_pages(pinned_pages); + *pages = NULL; + dev_dbg(scif_info.mdev.this_device, + "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len); + return err; +} + +int scif_pin_pages(void *addr, size_t len, int prot, + int map_flags, scif_pinned_pages_t *pages) +{ + return __scif_pin_pages(addr, len, &prot, map_flags, pages); +} +EXPORT_SYMBOL_GPL(scif_pin_pages); + +int scif_unpin_pages(scif_pinned_pages_t pinned_pages) +{ + int err = 0, ret; + + if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic) + return -EINVAL; + + ret = atomic_sub_return(1, &pinned_pages->ref_count); + if (ret < 0) { + dev_err(scif_info.mdev.this_device, + "%s %d scif_unpin_pages called without pinning? rc %d\n", + __func__, __LINE__, ret); + return -EINVAL; + } + /* + * Destroy the window if the ref count for this set of pinned + * pages has dropped to zero. If it is positive then there is + * a valid registered window which is backed by these pages and + * it will be destroyed once all such windows are unregistered. + */ + if (!ret) + err = scif_destroy_pinned_pages(pinned_pages); + + return err; +} +EXPORT_SYMBOL_GPL(scif_unpin_pages); + +static inline void +scif_insert_local_window(struct scif_window *window, struct scif_endpt *ep) +{ + mutex_lock(&ep->rma_info.rma_lock); + scif_insert_window(window, &ep->rma_info.reg_list); + mutex_unlock(&ep->rma_info.rma_lock); +} + +off_t scif_register_pinned_pages(scif_epd_t epd, + scif_pinned_pages_t pinned_pages, + off_t offset, int map_flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 computed_offset; + struct scif_window *window; + int err; + size_t len; + struct device *spdev; + + /* Unsupported flags */ + if (map_flags & ~SCIF_MAP_FIXED) + return -EINVAL; + + len = pinned_pages->nr_pages << PAGE_SHIFT; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. + */ + if ((map_flags & SCIF_MAP_FIXED) && + ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + might_sleep(); + + err = scif_verify_epd(ep); + if (err) + return err; + /* + * It is an error to pass pinned_pages to scif_register_pinned_pages() + * after calling scif_unpin_pages(). + */ + if (!atomic_add_unless(&pinned_pages->ref_count, 1, 0)) + return -EINVAL; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, map_flags, offset, + len, &computed_offset); + if (err) { + atomic_sub(1, &pinned_pages->ref_count); + return err; + } + + /* Allocate and prepare self registration window */ + window = scif_create_window(ep, pinned_pages->nr_pages, + computed_offset, false); + if (!window) { + atomic_sub(1, &pinned_pages->ref_count); + scif_free_window_offset(ep, NULL, computed_offset); + return -ENOMEM; + } + + window->pinned_pages = pinned_pages; + window->nr_pages = pinned_pages->nr_pages; + window->prot = pinned_pages->prot; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + scif_destroy_window(ep, window); + return err; + } + err = scif_send_alloc_request(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Prepare the remote registration window */ + err = scif_prep_remote_window(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + err = scif_send_scif_register(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + scif_put_peer_dev(spdev); + /* No further failures expected. Insert new window */ + scif_insert_local_window(window, ep); + return computed_offset; +error_unmap: + scif_destroy_window(ep, window); + scif_put_peer_dev(spdev); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_register_pinned_pages); + +off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot, int map_flags) +{ + scif_pinned_pages_t pinned_pages; + off_t err; + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 computed_offset; + struct scif_window *window; + struct mm_struct *mm = NULL; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI register: ep %p addr %p len 0x%lx offset 0x%lx prot 0x%x map_flags 0x%x\n", + epd, addr, len, offset, prot, map_flags); + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL)) + return -EINVAL; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. + */ + if ((map_flags & SCIF_MAP_FIXED) && + ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. len should be non zero */ + if (!len || (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || + (ALIGN(len, PAGE_SIZE) != len)) + return -EINVAL; + + might_sleep(); + + err = scif_verify_epd(ep); + if (err) + return err; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, map_flags, offset, + len >> PAGE_SHIFT, &computed_offset); + if (err) + return err; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + scif_free_window_offset(ep, NULL, computed_offset); + return err; + } + /* Allocate and prepare self registration window */ + window = scif_create_window(ep, len >> PAGE_SHIFT, + computed_offset, false); + if (!window) { + scif_free_window_offset(ep, NULL, computed_offset); + scif_put_peer_dev(spdev); + return -ENOMEM; + } + + window->nr_pages = len >> PAGE_SHIFT; + + err = scif_send_alloc_request(ep, window); + if (err) { + scif_destroy_incomplete_window(ep, window); + scif_put_peer_dev(spdev); + return err; + } + + if (!(map_flags & SCIF_MAP_KERNEL)) { + mm = __scif_acquire_mm(); + map_flags |= SCIF_MAP_ULIMIT; + } + /* Pin down the pages */ + err = __scif_pin_pages(addr, len, &prot, + map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT), + &pinned_pages); + if (err) { + scif_destroy_incomplete_window(ep, window); + __scif_release_mm(mm); + goto error; + } + + window->pinned_pages = pinned_pages; + window->prot = pinned_pages->prot; + window->mm = mm; + + /* Prepare the remote registration window */ + err = scif_prep_remote_window(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + err = scif_send_scif_register(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + scif_put_peer_dev(spdev); + /* No further failures expected. Insert new window */ + scif_insert_local_window(window, ep); + dev_dbg(&ep->remote_dev->sdev->dev, + "SCIFAPI register: ep %p addr %p len 0x%lx computed_offset 0x%llx\n", + epd, addr, len, computed_offset); + return computed_offset; +error_unmap: + scif_destroy_window(ep, window); +error: + scif_put_peer_dev(spdev); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_register); + +int +scif_unregister(scif_epd_t epd, off_t offset, size_t len) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_window *window = NULL; + struct scif_rma_req req; + int nr_pages, err; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI unregister: ep %p offset 0x%lx len 0x%lx\n", + ep, offset, len); + /* len must be page aligned. len should be non zero */ + if (!len || + (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) + return -EINVAL; + + /* Offset is not page aligned or offset+len wraps around */ + if ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset + (off_t)len < offset)) + return -EINVAL; + + err = scif_verify_epd(ep); + if (err) + return err; + + might_sleep(); + nr_pages = len >> PAGE_SHIFT; + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + /* Unregister all the windows in this range */ + err = scif_rma_list_unregister(window, offset, nr_pages); + if (err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); +error: + mutex_unlock(&ep->rma_info.rma_lock); + scif_put_peer_dev(spdev); + return err; +} +EXPORT_SYMBOL_GPL(scif_unregister); diff --git a/drivers/misc/mic/scif/scif_rma.h b/drivers/misc/mic/scif/scif_rma.h new file mode 100644 index 000000000..fa6722279 --- /dev/null +++ b/drivers/misc/mic/scif/scif_rma.h @@ -0,0 +1,464 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. + * + */ +#ifndef SCIF_RMA_H +#define SCIF_RMA_H + +#include <linux/dma_remapping.h> +#include <linux/mmu_notifier.h> + +#include "../bus/scif_bus.h" + +/* If this bit is set then the mark is a remote fence mark */ +#define SCIF_REMOTE_FENCE_BIT 31 +/* Magic value used to indicate a remote fence request */ +#define SCIF_REMOTE_FENCE BIT_ULL(SCIF_REMOTE_FENCE_BIT) + +#define SCIF_MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL) +#define SCIF_KMEM_UNALIGNED_BUF_SIZE (SCIF_MAX_UNALIGNED_BUF_SIZE + \ + (L1_CACHE_BYTES << 1)) + +#define SCIF_IOVA_START_PFN (1) +#define SCIF_IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) +#define SCIF_DMA_64BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(64)) +#define SCIF_DMA_63BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(63)) + +/* + * struct scif_endpt_rma_info - Per Endpoint Remote Memory Access Information + * + * @reg_list: List of registration windows for self + * @remote_reg_list: List of registration windows for peer + * @iovad: Offset generator + * @rma_lock: Synchronizes access to self/remote list and also protects the + * window from being destroyed while RMAs are in progress. + * @tc_lock: Synchronizes access to temporary cached windows list + * for SCIF Registration Caching. + * @mmn_lock: Synchronizes access to the list of MMU notifiers registered + * @tw_refcount: Keeps track of number of outstanding temporary registered + * windows created by scif_vreadfrom/scif_vwriteto which have + * not been destroyed. + * @tcw_refcount: Same as tw_refcount but for temporary cached windows + * @tcw_total_pages: Same as tcw_refcount but in terms of pages pinned + * @mmn_list: MMU notifier so that we can destroy the windows when required + * @fence_refcount: Keeps track of number of outstanding remote fence + * requests which have been received by the peer. + * @dma_chan: DMA channel used for all DMA transfers for this endpoint. + * @async_list_del: Detect asynchronous list entry deletion + * @vma_list: List of vmas with remote memory mappings + * @markwq: Wait queue used for scif_fence_mark/scif_fence_wait +*/ +struct scif_endpt_rma_info { + struct list_head reg_list; + struct list_head remote_reg_list; + struct iova_domain iovad; + struct mutex rma_lock; + spinlock_t tc_lock; + struct mutex mmn_lock; + atomic_t tw_refcount; + atomic_t tcw_refcount; + atomic_t tcw_total_pages; + struct list_head mmn_list; + atomic_t fence_refcount; + struct dma_chan *dma_chan; + int async_list_del; + struct list_head vma_list; + wait_queue_head_t markwq; +}; + +/* + * struct scif_fence_info - used for tracking fence requests + * + * @state: State of this transfer + * @wq: Fences wait on this queue + * @dma_mark: Used for storing the DMA mark + */ +struct scif_fence_info { + enum scif_msg_state state; + struct completion comp; + int dma_mark; +}; + +/* + * struct scif_remote_fence_info - used for tracking remote fence requests + * + * @msg: List of SCIF node QP fence messages + * @list: Link to list of remote fence requests + */ +struct scif_remote_fence_info { + struct scifmsg msg; + struct list_head list; +}; + +/* + * Specifies whether an RMA operation can span across partial windows, a single + * window or multiple contiguous windows. Mmaps can span across partial windows. + * Unregistration can span across complete windows. scif_get_pages() can span a + * single window. A window can also be of type self or peer. + */ +enum scif_window_type { + SCIF_WINDOW_PARTIAL, + SCIF_WINDOW_SINGLE, + SCIF_WINDOW_FULL, + SCIF_WINDOW_SELF, + SCIF_WINDOW_PEER +}; + +/* The number of physical addresses that can be stored in a PAGE. */ +#define SCIF_NR_ADDR_IN_PAGE (0x1000 >> 3) + +/* + * struct scif_rma_lookup - RMA lookup data structure for page list transfers + * + * Store an array of lookup offsets. Each offset in this array maps + * one 4K page containing 512 physical addresses i.e. 2MB. 512 such + * offsets in a 4K page will correspond to 1GB of registered address space. + + * @lookup: Array of offsets + * @offset: DMA offset of lookup array + */ +struct scif_rma_lookup { + dma_addr_t *lookup; + dma_addr_t offset; +}; + +/* + * struct scif_pinned_pages - A set of pinned pages obtained with + * scif_pin_pages() which could be part of multiple registered + * windows across different end points. + * + * @nr_pages: Number of pages which is defined as a s64 instead of an int + * to avoid sign extension with buffers >= 2GB + * @prot: read/write protections + * @map_flags: Flags specified during the pin operation + * @ref_count: Reference count bumped in terms of number of pages + * @magic: A magic value + * @pages: Array of pointers to struct pages populated with get_user_pages(..) + */ +struct scif_pinned_pages { + s64 nr_pages; + int prot; + int map_flags; + atomic_t ref_count; + u64 magic; + struct page **pages; +}; + +/* + * struct scif_status - Stores DMA status update information + * + * @src_dma_addr: Source buffer DMA address + * @val: src location for value to be written to the destination + * @ep: SCIF endpoint + */ +struct scif_status { + dma_addr_t src_dma_addr; + u64 val; + struct scif_endpt *ep; +}; + +/* + * struct scif_window - Registration Window for Self and Remote + * + * @nr_pages: Number of pages which is defined as a s64 instead of an int + * to avoid sign extension with buffers >= 2GB + * @nr_contig_chunks: Number of contiguous physical chunks + * @prot: read/write protections + * @ref_count: reference count in terms of number of pages + * @magic: Cookie to detect corruption + * @offset: registered offset + * @va_for_temp: va address that this window represents + * @dma_mark: Used to determine if all DMAs against the window are done + * @ep: Pointer to EP. Useful for passing EP around with messages to + avoid expensive list traversals. + * @list: link to list of windows for the endpoint + * @type: self or peer window + * @peer_window: Pointer to peer window. Useful for sending messages to peer + * without requiring an extra list traversal + * @unreg_state: unregistration state + * @offset_freed: True if the offset has been freed + * @temp: True for temporary windows created via scif_vreadfrom/scif_vwriteto + * @mm: memory descriptor for the task_struct which initiated the RMA + * @st: scatter gather table for DMA mappings with IOMMU enabled + * @pinned_pages: The set of pinned_pages backing this window + * @alloc_handle: Handle for sending ALLOC_REQ + * @regwq: Wait Queue for an registration (N)ACK + * @reg_state: Registration state + * @unregwq: Wait Queue for an unregistration (N)ACK + * @dma_addr_lookup: Lookup for physical addresses used for DMA + * @nr_lookup: Number of entries in lookup + * @mapped_offset: Offset used to map the window by the peer + * @dma_addr: Array of physical addresses used for Mgmt node & MIC initiated DMA + * @num_pages: Array specifying number of pages for each physical address + */ +struct scif_window { + s64 nr_pages; + int nr_contig_chunks; + int prot; + int ref_count; + u64 magic; + s64 offset; + unsigned long va_for_temp; + int dma_mark; + u64 ep; + struct list_head list; + enum scif_window_type type; + u64 peer_window; + enum scif_msg_state unreg_state; + bool offset_freed; + bool temp; + struct mm_struct *mm; + struct sg_table *st; + union { + struct { + struct scif_pinned_pages *pinned_pages; + struct scif_allocmsg alloc_handle; + wait_queue_head_t regwq; + enum scif_msg_state reg_state; + wait_queue_head_t unregwq; + }; + struct { + struct scif_rma_lookup dma_addr_lookup; + struct scif_rma_lookup num_pages_lookup; + int nr_lookup; + dma_addr_t mapped_offset; + }; + }; + dma_addr_t *dma_addr; + u64 *num_pages; +} __packed; + +/* + * scif_mmu_notif - SCIF mmu notifier information + * + * @mmu_notifier ep_mmu_notifier: MMU notifier operations + * @tc_reg_list: List of temp registration windows for self + * @mm: memory descriptor for the task_struct which initiated the RMA + * @ep: SCIF endpoint + * @list: link to list of MMU notifier information + */ +struct scif_mmu_notif { +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier ep_mmu_notifier; +#endif + struct list_head tc_reg_list; + struct mm_struct *mm; + struct scif_endpt *ep; + struct list_head list; +}; + +enum scif_rma_dir { + SCIF_LOCAL_TO_REMOTE, + SCIF_REMOTE_TO_LOCAL +}; + +extern struct kmem_cache *unaligned_cache; +/* Initialize RMA for this EP */ +void scif_rma_ep_init(struct scif_endpt *ep); +/* Check if epd can be uninitialized */ +int scif_rma_ep_can_uninit(struct scif_endpt *ep); +/* Obtain a new offset. Callee must grab RMA lock */ +int scif_get_window_offset(struct scif_endpt *ep, int flags, + s64 offset, int nr_pages, s64 *out_offset); +/* Free offset. Callee must grab RMA lock */ +void scif_free_window_offset(struct scif_endpt *ep, + struct scif_window *window, s64 offset); +/* Create self registration window */ +struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, + s64 offset, bool temp); +/* Destroy self registration window.*/ +int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window); +void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window); +/* Map pages of self window to Aperture/PCI */ +int scif_map_window(struct scif_dev *remote_dev, + struct scif_window *window); +/* Unregister a self window */ +int scif_unregister_window(struct scif_window *window); +/* Destroy remote registration window */ +void +scif_destroy_remote_window(struct scif_window *window); +/* remove valid remote memory mappings from process address space */ +void scif_zap_mmaps(int node); +/* Query if any applications have remote memory mappings */ +bool scif_rma_do_apps_have_mmaps(int node); +/* Cleanup remote registration lists for zombie endpoints */ +void scif_cleanup_rma_for_zombies(int node); +/* Reserve a DMA channel for a particular endpoint */ +int scif_reserve_dma_chan(struct scif_endpt *ep); +/* Setup a DMA mark for an endpoint */ +int _scif_fence_mark(scif_epd_t epd, int *mark); +int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, + enum scif_window_type type); +void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_mmu_notif_handler(struct work_struct *work); +void scif_rma_handle_remote_fences(void); +void scif_rma_destroy_windows(void); +void scif_rma_destroy_tcw_invalid(void); +int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan); + +struct scif_window_iter { + s64 offset; + int index; +}; + +static inline void +scif_init_window_iter(struct scif_window *window, struct scif_window_iter *iter) +{ + iter->offset = window->offset; + iter->index = 0; +} + +dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off, + size_t *nr_bytes, + struct scif_window_iter *iter); +static inline +dma_addr_t __scif_off_to_dma_addr(struct scif_window *window, s64 off) +{ + return scif_off_to_dma_addr(window, off, NULL, NULL); +} + +static inline bool scif_unaligned(off_t src_offset, off_t dst_offset) +{ + src_offset = src_offset & (L1_CACHE_BYTES - 1); + dst_offset = dst_offset & (L1_CACHE_BYTES - 1); + return !(src_offset == dst_offset); +} + +/* + * scif_zalloc: + * @size: Size of the allocation request. + * + * Helper API which attempts to allocate zeroed pages via + * __get_free_pages(..) first and then falls back on + * vzalloc(..) if that fails. + */ +static inline void *scif_zalloc(size_t size) +{ + void *ret = NULL; + size_t align = ALIGN(size, PAGE_SIZE); + + if (align && get_order(align) < MAX_ORDER) + ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(align)); + return ret ? ret : vzalloc(align); +} + +/* + * scif_free: + * @addr: Address to be freed. + * @size: Size of the allocation. + * Helper API which frees memory allocated via scif_zalloc(). + */ +static inline void scif_free(void *addr, size_t size) +{ + size_t align = ALIGN(size, PAGE_SIZE); + + if (is_vmalloc_addr(addr)) + vfree(addr); + else + free_pages((unsigned long)addr, get_order(align)); +} + +static inline void scif_get_window(struct scif_window *window, int nr_pages) +{ + window->ref_count += nr_pages; +} + +static inline void scif_put_window(struct scif_window *window, int nr_pages) +{ + window->ref_count -= nr_pages; +} + +static inline void scif_set_window_ref(struct scif_window *window, int nr_pages) +{ + window->ref_count = nr_pages; +} + +static inline void +scif_queue_for_cleanup(struct scif_window *window, struct list_head *list) +{ + spin_lock(&scif_info.rmalock); + list_add_tail(&window->list, list); + spin_unlock(&scif_info.rmalock); + schedule_work(&scif_info.misc_work); +} + +static inline void __scif_rma_destroy_tcw_helper(struct scif_window *window) +{ + list_del_init(&window->list); + scif_queue_for_cleanup(window, &scif_info.rma_tc); +} + +static inline bool scif_is_iommu_enabled(void) +{ +#ifdef CONFIG_INTEL_IOMMU + return intel_iommu_enabled; +#else + return false; +#endif +} +#endif /* SCIF_RMA_H */ diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c new file mode 100644 index 000000000..e1ef8daed --- /dev/null +++ b/drivers/misc/mic/scif/scif_rma_list.c @@ -0,0 +1,291 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_main.h" +#include <linux/mmu_notifier.h> +#include <linux/highmem.h> + +/* + * scif_insert_tcw: + * + * Insert a temp window to the temp registration list sorted by va_for_temp. + * RMA lock must be held. + */ +void scif_insert_tcw(struct scif_window *window, struct list_head *head) +{ + struct scif_window *curr = NULL; + struct scif_window *prev = list_entry(head, struct scif_window, list); + struct list_head *item; + + INIT_LIST_HEAD(&window->list); + /* Compare with tail and if the entry is new tail add it to the end */ + if (!list_empty(head)) { + curr = list_entry(head->prev, struct scif_window, list); + if (curr->va_for_temp < window->va_for_temp) { + list_add_tail(&window->list, head); + return; + } + } + list_for_each(item, head) { + curr = list_entry(item, struct scif_window, list); + if (curr->va_for_temp > window->va_for_temp) + break; + prev = curr; + } + list_add(&window->list, &prev->list); +} + +/* + * scif_insert_window: + * + * Insert a window to the self registration list sorted by offset. + * RMA lock must be held. + */ +void scif_insert_window(struct scif_window *window, struct list_head *head) +{ + struct scif_window *curr = NULL, *prev = NULL; + struct list_head *item; + + INIT_LIST_HEAD(&window->list); + list_for_each(item, head) { + curr = list_entry(item, struct scif_window, list); + if (curr->offset > window->offset) + break; + prev = curr; + } + if (!prev) + list_add(&window->list, head); + else + list_add(&window->list, &prev->list); + scif_set_window_ref(window, window->nr_pages); +} + +/* + * scif_query_tcw: + * + * Query the temp cached registration list of ep for an overlapping window + * in case of permission mismatch, destroy the previous window. if permissions + * match and overlap is partial, destroy the window but return the new range + * RMA lock must be held. + */ +int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *req) +{ + struct list_head *item, *temp, *head = req->head; + struct scif_window *window; + u64 start_va_window, start_va_req = req->va_for_temp; + u64 end_va_window, end_va_req = start_va_req + req->nr_bytes; + + if (!req->nr_bytes) + return -EINVAL; + /* + * Avoid traversing the entire list to find out that there + * is no entry that matches + */ + if (!list_empty(head)) { + window = list_last_entry(head, struct scif_window, list); + end_va_window = window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + if (start_va_req > end_va_window) + return -ENXIO; + } + list_for_each_safe(item, temp, head) { + window = list_entry(item, struct scif_window, list); + start_va_window = window->va_for_temp; + end_va_window = window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + if (start_va_req < start_va_window && + end_va_req < start_va_window) + break; + if (start_va_req >= end_va_window) + continue; + if ((window->prot & req->prot) == req->prot) { + if (start_va_req >= start_va_window && + end_va_req <= end_va_window) { + *req->out_window = window; + return 0; + } + /* expand window */ + if (start_va_req < start_va_window) { + req->nr_bytes += + start_va_window - start_va_req; + req->va_for_temp = start_va_window; + } + if (end_va_req >= end_va_window) + req->nr_bytes += end_va_window - end_va_req; + } + /* Destroy the old window to create a new one */ + __scif_rma_destroy_tcw_helper(window); + break; + } + return -ENXIO; +} + +/* + * scif_query_window: + * + * Query the registration list and check if a valid contiguous + * range of windows exist. + * RMA lock must be held. + */ +int scif_query_window(struct scif_rma_req *req) +{ + struct list_head *item; + struct scif_window *window; + s64 end_offset, offset = req->offset; + u64 tmp_min, nr_bytes_left = req->nr_bytes; + + if (!req->nr_bytes) + return -EINVAL; + + list_for_each(item, req->head) { + window = list_entry(item, struct scif_window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + if (offset < window->offset) + /* Offset not found! */ + return -ENXIO; + if (offset >= end_offset) + continue; + /* Check read/write protections. */ + if ((window->prot & req->prot) != req->prot) + return -EPERM; + if (nr_bytes_left == req->nr_bytes) + /* Store the first window */ + *req->out_window = window; + tmp_min = min((u64)end_offset - offset, nr_bytes_left); + nr_bytes_left -= tmp_min; + offset += tmp_min; + /* + * Range requested encompasses + * multiple windows contiguously. + */ + if (!nr_bytes_left) { + /* Done for partial window */ + if (req->type == SCIF_WINDOW_PARTIAL || + req->type == SCIF_WINDOW_SINGLE) + return 0; + /* Extra logic for full windows */ + if (offset == end_offset) + /* Spanning multiple whole windows */ + return 0; + /* Not spanning multiple whole windows */ + return -ENXIO; + } + if (req->type == SCIF_WINDOW_SINGLE) + break; + } + dev_err(scif_info.mdev.this_device, + "%s %d ENXIO\n", __func__, __LINE__); + return -ENXIO; +} + +/* + * scif_rma_list_unregister: + * + * Traverse the self registration list starting from window: + * 1) Call scif_unregister_window(..) + * RMA lock must be held. + */ +int scif_rma_list_unregister(struct scif_window *window, + s64 offset, int nr_pages) +{ + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + struct list_head *head = &ep->rma_info.reg_list; + s64 end_offset; + int err = 0; + int loop_nr_pages; + struct scif_window *_window; + + list_for_each_entry_safe_from(window, _window, head, list) { + end_offset = window->offset + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT), + nr_pages); + err = scif_unregister_window(window); + if (err) + return err; + nr_pages -= loop_nr_pages; + offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages) + break; + } + return 0; +} + +/* + * scif_unmap_all_window: + * + * Traverse all the windows in the self registration list and: + * 1) Delete any DMA mappings created + */ +void scif_unmap_all_windows(scif_epd_t epd) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct list_head *head = &ep->rma_info.reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct scif_window, list); + scif_unmap_window(ep->remote_dev, window); + } + mutex_unlock(&ep->rma_info.rma_lock); +} + +/* + * scif_unregister_all_window: + * + * Traverse all the windows in the self registration list and: + * 1) Call scif_unregister_window(..) + * RMA lock must be held. + */ +int scif_unregister_all_windows(scif_epd_t epd) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct list_head *head = &ep->rma_info.reg_list; + int err = 0; + + mutex_lock(&ep->rma_info.rma_lock); +retry: + item = NULL; + tmp = NULL; + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct scif_window, list); + ep->rma_info.async_list_del = 0; + err = scif_unregister_window(window); + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", + __func__, __LINE__, err); + /* + * Need to restart list traversal if there has been + * an asynchronous list entry deletion. + */ + if (ACCESS_ONCE(ep->rma_info.async_list_del)) + goto retry; + } + mutex_unlock(&ep->rma_info.rma_lock); + if (!list_empty(&ep->rma_info.mmn_list)) { + spin_lock(&scif_info.rmalock); + list_add_tail(&ep->mmu_list, &scif_info.mmu_notif_cleanup); + spin_unlock(&scif_info.rmalock); + schedule_work(&scif_info.mmu_notif_work); + } + return err; +} diff --git a/drivers/misc/mic/scif/scif_rma_list.h b/drivers/misc/mic/scif/scif_rma_list.h new file mode 100644 index 000000000..7d58d1d55 --- /dev/null +++ b/drivers/misc/mic/scif/scif_rma_list.h @@ -0,0 +1,57 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#ifndef SCIF_RMA_LIST_H +#define SCIF_RMA_LIST_H + +/* + * struct scif_rma_req - Self Registration list RMA Request query + * + * @out_window - Returns the window if found + * @offset: Starting offset + * @nr_bytes: number of bytes + * @prot: protection requested i.e. read or write or both + * @type: Specify single, partial or multiple windows + * @head: Head of list on which to search + * @va_for_temp: VA for searching temporary cached windows + */ +struct scif_rma_req { + struct scif_window **out_window; + union { + s64 offset; + unsigned long va_for_temp; + }; + size_t nr_bytes; + int prot; + enum scif_window_type type; + struct list_head *head; +}; + +/* Insert */ +void scif_insert_window(struct scif_window *window, struct list_head *head); +void scif_insert_tcw(struct scif_window *window, + struct list_head *head); +/* Query */ +int scif_query_window(struct scif_rma_req *request); +int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *request); +/* Called from close to unregister all self windows */ +int scif_unregister_all_windows(scif_epd_t epd); +void scif_unmap_all_windows(scif_epd_t epd); +/* Traverse list and unregister */ +int scif_rma_list_unregister(struct scif_window *window, s64 offset, + int nr_pages); +#endif /* SCIF_RMA_LIST_H */ |