From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- drivers/gpu/drm/vc4/vc4_bo.c | 4 +- drivers/gpu/drm/vc4/vc4_crtc.c | 183 +++++++++++- drivers/gpu/drm/vc4/vc4_dpi.c | 23 +- drivers/gpu/drm/vc4/vc4_drv.c | 70 +++-- drivers/gpu/drm/vc4/vc4_drv.h | 12 +- drivers/gpu/drm/vc4/vc4_gem.c | 25 +- drivers/gpu/drm/vc4/vc4_hdmi.c | 22 +- drivers/gpu/drm/vc4/vc4_kms.c | 15 +- drivers/gpu/drm/vc4/vc4_plane.c | 13 +- drivers/gpu/drm/vc4/vc4_qpu_defines.h | 17 +- drivers/gpu/drm/vc4/vc4_regs.h | 22 +- drivers/gpu/drm/vc4/vc4_validate.c | 13 +- drivers/gpu/drm/vc4/vc4_validate_shaders.c | 455 ++++++++++++++++++++++++++--- 13 files changed, 729 insertions(+), 145 deletions(-) (limited to 'drivers/gpu/drm/vc4') diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c index e5a9d3aaf..3f6704cf6 100644 --- a/drivers/gpu/drm/vc4/vc4_bo.c +++ b/drivers/gpu/drm/vc4/vc4_bo.c @@ -144,7 +144,7 @@ static struct list_head *vc4_get_cache_list_for_size(struct drm_device *dev, return &vc4->bo_cache.size_list[page_index]; } -void vc4_bo_cache_purge(struct drm_device *dev) +static void vc4_bo_cache_purge(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); @@ -291,8 +291,6 @@ static void vc4_bo_cache_free_old(struct drm_device *dev) /* Called on the last userspace/kernel unreference of the BO. Returns * it to the BO cache if possible, otherwise frees it. - * - * Note that this is called with the struct_mutex held. */ void vc4_free_object(struct drm_gem_object *gem_bo) { diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c index 0f18b76c7..8fc2b731b 100644 --- a/drivers/gpu/drm/vc4/vc4_crtc.c +++ b/drivers/gpu/drm/vc4/vc4_crtc.c @@ -46,12 +46,17 @@ struct vc4_crtc { const struct vc4_crtc_data *data; void __iomem *regs; + /* Timestamp at start of vblank irq - unaffected by lock delays. */ + ktime_t t_vblank; + /* Which HVS channel we're using for our CRTC. */ int channel; u8 lut_r[256]; u8 lut_g[256]; u8 lut_b[256]; + /* Size in pixels of the COB memory allocated to this CRTC. */ + u32 cob_size; struct drm_pending_vblank_event *event; }; @@ -146,6 +151,144 @@ int vc4_crtc_debugfs_regs(struct seq_file *m, void *unused) } #endif +int vc4_crtc_get_scanoutpos(struct drm_device *dev, unsigned int crtc_id, + unsigned int flags, int *vpos, int *hpos, + ktime_t *stime, ktime_t *etime, + const struct drm_display_mode *mode) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct vc4_crtc *vc4_crtc = vc4->crtc[crtc_id]; + u32 val; + int fifo_lines; + int vblank_lines; + int ret = 0; + + /* + * XXX Doesn't work well in interlaced mode yet, partially due + * to problems in vc4 kms or drm core interlaced mode handling, + * so disable for now in interlaced mode. + */ + if (mode->flags & DRM_MODE_FLAG_INTERLACE) + return ret; + + /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + + /* Get optional system timestamp before query. */ + if (stime) + *stime = ktime_get(); + + /* + * Read vertical scanline which is currently composed for our + * pixelvalve by the HVS, and also the scaler status. + */ + val = HVS_READ(SCALER_DISPSTATX(vc4_crtc->channel)); + + /* Get optional system timestamp after query. */ + if (etime) + *etime = ktime_get(); + + /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + + /* Vertical position of hvs composed scanline. */ + *vpos = VC4_GET_FIELD(val, SCALER_DISPSTATX_LINE); + + /* No hpos info available. */ + if (hpos) + *hpos = 0; + + /* This is the offset we need for translating hvs -> pv scanout pos. */ + fifo_lines = vc4_crtc->cob_size / mode->crtc_hdisplay; + + if (fifo_lines > 0) + ret |= DRM_SCANOUTPOS_VALID; + + /* HVS more than fifo_lines into frame for compositing? */ + if (*vpos > fifo_lines) { + /* + * We are in active scanout and can get some meaningful results + * from HVS. The actual PV scanout can not trail behind more + * than fifo_lines as that is the fifo's capacity. Assume that + * in active scanout the HVS and PV work in lockstep wrt. HVS + * refilling the fifo and PV consuming from the fifo, ie. + * whenever the PV consumes and frees up a scanline in the + * fifo, the HVS will immediately refill it, therefore + * incrementing vpos. Therefore we choose HVS read position - + * fifo size in scanlines as a estimate of the real scanout + * position of the PV. + */ + *vpos -= fifo_lines + 1; + if (mode->flags & DRM_MODE_FLAG_INTERLACE) + *vpos /= 2; + + ret |= DRM_SCANOUTPOS_ACCURATE; + return ret; + } + + /* + * Less: This happens when we are in vblank and the HVS, after getting + * the VSTART restart signal from the PV, just started refilling its + * fifo with new lines from the top-most lines of the new framebuffers. + * The PV does not scan out in vblank, so does not remove lines from + * the fifo, so the fifo will be full quickly and the HVS has to pause. + * We can't get meaningful readings wrt. scanline position of the PV + * and need to make things up in a approximative but consistent way. + */ + ret |= DRM_SCANOUTPOS_IN_VBLANK; + vblank_lines = mode->crtc_vtotal - mode->crtc_vdisplay; + + if (flags & DRM_CALLED_FROM_VBLIRQ) { + /* + * Assume the irq handler got called close to first + * line of vblank, so PV has about a full vblank + * scanlines to go, and as a base timestamp use the + * one taken at entry into vblank irq handler, so it + * is not affected by random delays due to lock + * contention on event_lock or vblank_time lock in + * the core. + */ + *vpos = -vblank_lines; + + if (stime) + *stime = vc4_crtc->t_vblank; + if (etime) + *etime = vc4_crtc->t_vblank; + + /* + * If the HVS fifo is not yet full then we know for certain + * we are at the very beginning of vblank, as the hvs just + * started refilling, and the stime and etime timestamps + * truly correspond to start of vblank. + */ + if ((val & SCALER_DISPSTATX_FULL) != SCALER_DISPSTATX_FULL) + ret |= DRM_SCANOUTPOS_ACCURATE; + } else { + /* + * No clue where we are inside vblank. Return a vpos of zero, + * which will cause calling code to just return the etime + * timestamp uncorrected. At least this is no worse than the + * standard fallback. + */ + *vpos = 0; + } + + return ret; +} + +int vc4_crtc_get_vblank_timestamp(struct drm_device *dev, unsigned int crtc_id, + int *max_error, struct timeval *vblank_time, + unsigned flags) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct vc4_crtc *vc4_crtc = vc4->crtc[crtc_id]; + struct drm_crtc *crtc = &vc4_crtc->base; + struct drm_crtc_state *state = crtc->state; + + /* Helper routine in DRM core does all the work: */ + return drm_calc_vbltimestamp_from_scanoutpos(dev, crtc_id, max_error, + vblank_time, flags, + &state->adjusted_mode); +} + static void vc4_crtc_destroy(struct drm_crtc *crtc) { drm_crtc_cleanup(crtc); @@ -175,20 +318,22 @@ vc4_crtc_lut_load(struct drm_crtc *crtc) HVS_WRITE(SCALER_GAMDATA, vc4_crtc->lut_b[i]); } -static void +static int vc4_crtc_gamma_set(struct drm_crtc *crtc, u16 *r, u16 *g, u16 *b, - uint32_t start, uint32_t size) + uint32_t size) { struct vc4_crtc *vc4_crtc = to_vc4_crtc(crtc); u32 i; - for (i = start; i < start + size; i++) { + for (i = 0; i < size; i++) { vc4_crtc->lut_r[i] = r[i] >> 8; vc4_crtc->lut_g[i] = g[i] >> 8; vc4_crtc->lut_b[i] = b[i] >> 8; } vc4_crtc_lut_load(crtc); + + return 0; } static u32 vc4_get_fifo_full_level(u32 format) @@ -395,6 +540,7 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc, struct vc4_dev *vc4 = to_vc4_dev(dev); struct drm_plane *plane; unsigned long flags; + const struct drm_plane_state *plane_state; u32 dlist_count = 0; int ret; @@ -404,18 +550,8 @@ static int vc4_crtc_atomic_check(struct drm_crtc *crtc, if (hweight32(state->connector_mask) > 1) return -EINVAL; - drm_atomic_crtc_state_for_each_plane(plane, state) { - struct drm_plane_state *plane_state = - state->state->plane_states[drm_plane_index(plane)]; - - /* plane might not have changed, in which case take - * current state: - */ - if (!plane_state) - plane_state = plane->state; - + drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, state) dlist_count += vc4_plane_dlist_size(plane_state); - } dlist_count++; /* Account for SCALER_CTL0_END. */ @@ -526,6 +662,7 @@ static irqreturn_t vc4_crtc_irq_handler(int irq, void *data) irqreturn_t ret = IRQ_NONE; if (stat & PV_INT_VFP_START) { + vc4_crtc->t_vblank = ktime_get(); CRTC_WRITE(PV_INTSTAT, PV_INT_VFP_START); drm_crtc_handle_vblank(&vc4_crtc->base); vc4_crtc_handle_page_flip(vc4_crtc); @@ -730,6 +867,22 @@ static void vc4_set_crtc_possible_masks(struct drm_device *drm, } } +static void +vc4_crtc_get_cob_allocation(struct vc4_crtc *vc4_crtc) +{ + struct drm_device *drm = vc4_crtc->base.dev; + struct vc4_dev *vc4 = to_vc4_dev(drm); + u32 dispbase = HVS_READ(SCALER_DISPBASEX(vc4_crtc->channel)); + /* Top/base are supposed to be 4-pixel aligned, but the + * Raspberry Pi firmware fills the low bits (which are + * presumably ignored). + */ + u32 top = VC4_GET_FIELD(dispbase, SCALER_DISPBASEX_TOP) & ~3; + u32 base = VC4_GET_FIELD(dispbase, SCALER_DISPBASEX_BASE) & ~3; + + vc4_crtc->cob_size = top - base + 4; +} + static int vc4_crtc_bind(struct device *dev, struct device *master, void *data) { struct platform_device *pdev = to_platform_device(dev); @@ -806,6 +959,8 @@ static int vc4_crtc_bind(struct device *dev, struct device *master, void *data) crtc->cursor = cursor_plane; } + vc4_crtc_get_cob_allocation(vc4_crtc); + CRTC_WRITE(PV_INTEN, 0); CRTC_WRITE(PV_INTSTAT, PV_INT_VFP_START); ret = devm_request_irq(dev, platform_get_irq(pdev, 0), diff --git a/drivers/gpu/drm/vc4/vc4_dpi.c b/drivers/gpu/drm/vc4/vc4_dpi.c index 9817dbfa4..275fedbdb 100644 --- a/drivers/gpu/drm/vc4/vc4_dpi.c +++ b/drivers/gpu/drm/vc4/vc4_dpi.c @@ -208,14 +208,6 @@ static int vc4_dpi_connector_get_modes(struct drm_connector *connector) return 0; } -static struct drm_encoder * -vc4_dpi_connector_best_encoder(struct drm_connector *connector) -{ - struct vc4_dpi_connector *dpi_connector = - to_vc4_dpi_connector(connector); - return dpi_connector->encoder; -} - static const struct drm_connector_funcs vc4_dpi_connector_funcs = { .dpms = drm_atomic_helper_connector_dpms, .detect = vc4_dpi_connector_detect, @@ -228,7 +220,6 @@ static const struct drm_connector_funcs vc4_dpi_connector_funcs = { static const struct drm_connector_helper_funcs vc4_dpi_connector_helper_funcs = { .get_modes = vc4_dpi_connector_get_modes, - .best_encoder = vc4_dpi_connector_best_encoder, }; static struct drm_connector *vc4_dpi_connector_init(struct drm_device *dev, @@ -236,14 +227,12 @@ static struct drm_connector *vc4_dpi_connector_init(struct drm_device *dev, { struct drm_connector *connector = NULL; struct vc4_dpi_connector *dpi_connector; - int ret = 0; dpi_connector = devm_kzalloc(dev->dev, sizeof(*dpi_connector), GFP_KERNEL); - if (!dpi_connector) { - ret = -ENOMEM; - goto fail; - } + if (!dpi_connector) + return ERR_PTR(-ENOMEM); + connector = &dpi_connector->base; dpi_connector->encoder = dpi->encoder; @@ -260,12 +249,6 @@ static struct drm_connector *vc4_dpi_connector_init(struct drm_device *dev, drm_mode_connector_attach_encoder(connector, dpi->encoder); return connector; - - fail: - if (connector) - vc4_dpi_connector_destroy(connector); - - return ERR_PTR(ret); } static const struct drm_encoder_funcs vc4_dpi_encoder_funcs = { diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 250ed7e37..9ecef9385 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "drm_fb_cma_helper.h" #include "uapi/drm/vc4_drm.h" @@ -43,12 +44,54 @@ void __iomem *vc4_ioremap_regs(struct platform_device *dev, int index) return map; } +static int vc4_get_param_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + struct drm_vc4_get_param *args = data; + int ret; + + if (args->pad != 0) + return -EINVAL; + + switch (args->param) { + case DRM_VC4_PARAM_V3D_IDENT0: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret < 0) + return ret; + args->value = V3D_READ(V3D_IDENT0); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + case DRM_VC4_PARAM_V3D_IDENT1: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret < 0) + return ret; + args->value = V3D_READ(V3D_IDENT1); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + case DRM_VC4_PARAM_V3D_IDENT2: + ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev); + if (ret < 0) + return ret; + args->value = V3D_READ(V3D_IDENT2); + pm_runtime_put(&vc4->v3d->pdev->dev); + break; + case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + args->value = true; + break; + default: + DRM_DEBUG("Unknown parameter %d\n", args->param); + return -EINVAL; + } + + return 0; +} + static void vc4_lastclose(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - if (vc4->fbdev) - drm_fbdev_cma_restore_mode(vc4->fbdev); + drm_fbdev_cma_restore_mode(vc4->fbdev); } static const struct file_operations vc4_drm_fops = { @@ -74,6 +117,7 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl, DRM_ROOT_ONLY), + DRM_IOCTL_DEF_DRV(VC4_GET_PARAM, vc4_get_param_ioctl, DRM_RENDER_ALLOW), }; static struct drm_driver vc4_drm_driver = { @@ -92,6 +136,8 @@ static struct drm_driver vc4_drm_driver = { .enable_vblank = vc4_enable_vblank, .disable_vblank = vc4_disable_vblank, .get_vblank_counter = drm_vblank_no_hw_counter, + .get_scanout_position = vc4_crtc_get_scanoutpos, + .get_vblank_timestamp = vc4_crtc_get_vblank_timestamp, #if defined(CONFIG_DEBUG_FS) .debugfs_init = vc4_debugfs_init, @@ -99,7 +145,7 @@ static struct drm_driver vc4_drm_driver = { #endif .gem_create_object = vc4_create_object, - .gem_free_object = vc4_free_object, + .gem_free_object_unlocked = vc4_free_object, .gem_vm_ops = &drm_gem_cma_vm_ops, .prime_handle_to_fd = drm_gem_prime_handle_to_fd, @@ -176,7 +222,6 @@ static int vc4_drm_bind(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct drm_device *drm; - struct drm_connector *connector; struct vc4_dev *vc4; int ret = 0; @@ -196,8 +241,6 @@ static int vc4_drm_bind(struct device *dev) vc4_bo_cache_init(drm); drm_mode_config_init(drm); - if (ret) - goto unref; vc4_gem_init(drm); @@ -211,27 +254,14 @@ static int vc4_drm_bind(struct device *dev) if (ret < 0) goto unbind_all; - /* Connector registration has to occur after DRM device - * registration, because it creates sysfs entries based on the - * DRM device. - */ - list_for_each_entry(connector, &drm->mode_config.connector_list, head) { - ret = drm_connector_register(connector); - if (ret) - goto unregister; - } - vc4_kms_load(drm); return 0; -unregister: - drm_dev_unregister(drm); unbind_all: component_unbind_all(dev, drm); gem_destroy: vc4_gem_destroy(drm); -unref: drm_dev_unref(drm); vc4_bo_cache_destroy(drm); return ret; @@ -259,8 +289,8 @@ static const struct component_master_ops vc4_drm_ops = { static struct platform_driver *const component_drivers[] = { &vc4_hdmi_driver, &vc4_dpi_driver, - &vc4_crtc_driver, &vc4_hvs_driver, + &vc4_crtc_driver, &vc4_v3d_driver, }; diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h index 2e24616cd..428e24919 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.h +++ b/drivers/gpu/drm/vc4/vc4_drv.h @@ -364,6 +364,9 @@ struct vc4_validated_shader_info { uint32_t uniforms_src_size; uint32_t num_texture_samples; struct vc4_texture_sample_info *texture_samples; + + uint32_t num_uniform_addr_offsets; + uint32_t *uniform_addr_offsets; }; /** @@ -424,6 +427,13 @@ extern struct platform_driver vc4_crtc_driver; int vc4_enable_vblank(struct drm_device *dev, unsigned int crtc_id); void vc4_disable_vblank(struct drm_device *dev, unsigned int crtc_id); int vc4_crtc_debugfs_regs(struct seq_file *m, void *arg); +int vc4_crtc_get_scanoutpos(struct drm_device *dev, unsigned int crtc_id, + unsigned int flags, int *vpos, int *hpos, + ktime_t *stime, ktime_t *etime, + const struct drm_display_mode *mode); +int vc4_crtc_get_vblank_timestamp(struct drm_device *dev, unsigned int crtc_id, + int *max_error, struct timeval *vblank_time, + unsigned flags); /* vc4_debugfs.c */ int vc4_debugfs_init(struct drm_minor *minor); @@ -478,7 +488,7 @@ int vc4_kms_load(struct drm_device *dev); struct drm_plane *vc4_plane_init(struct drm_device *dev, enum drm_plane_type type); u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist); -u32 vc4_plane_dlist_size(struct drm_plane_state *state); +u32 vc4_plane_dlist_size(const struct drm_plane_state *state); void vc4_plane_async_set_fb(struct drm_plane *plane, struct drm_framebuffer *fb); diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index 78ab08e8f..b262c5c26 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -53,10 +53,8 @@ vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state) { unsigned int i; - mutex_lock(&dev->struct_mutex); for (i = 0; i < state->user_state.bo_count; i++) - drm_gem_object_unreference(state->bo[i]); - mutex_unlock(&dev->struct_mutex); + drm_gem_object_unreference_unlocked(state->bo[i]); kfree(state); } @@ -536,8 +534,8 @@ vc4_cl_lookup_bos(struct drm_device *dev, return -EINVAL; } - exec->bo = kcalloc(exec->bo_count, sizeof(struct drm_gem_cma_object *), - GFP_KERNEL); + exec->bo = drm_calloc_large(exec->bo_count, + sizeof(struct drm_gem_cma_object *)); if (!exec->bo) { DRM_ERROR("Failed to allocate validated BO pointers\n"); return -ENOMEM; @@ -610,7 +608,7 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) * read the contents back for validation, and I think the * bo->vaddr is uncached access. */ - temp = kmalloc(temp_size, GFP_KERNEL); + temp = drm_malloc_ab(temp_size, 1); if (!temp) { DRM_ERROR("Failed to allocate storage for copying " "in bin/render CLs.\n"); @@ -677,7 +675,7 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec) ret = vc4_validate_shader_recs(dev, exec); fail: - kfree(temp); + drm_free_large(temp); return ret; } @@ -687,21 +685,18 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec) struct vc4_dev *vc4 = to_vc4_dev(dev); unsigned i; - /* Need the struct lock for drm_gem_object_unreference(). */ - mutex_lock(&dev->struct_mutex); if (exec->bo) { for (i = 0; i < exec->bo_count; i++) - drm_gem_object_unreference(&exec->bo[i]->base); - kfree(exec->bo); + drm_gem_object_unreference_unlocked(&exec->bo[i]->base); + drm_free_large(exec->bo); } while (!list_empty(&exec->unref_list)) { struct vc4_bo *bo = list_first_entry(&exec->unref_list, struct vc4_bo, unref_head); list_del(&bo->unref_head); - drm_gem_object_unreference(&bo->base.base); + drm_gem_object_unreference_unlocked(&bo->base.base); } - mutex_unlock(&dev->struct_mutex); mutex_lock(&vc4->power_lock); if (--vc4->power_refcount == 0) @@ -947,8 +942,8 @@ vc4_gem_destroy(struct drm_device *dev) vc4->overflow_mem = NULL; } - vc4_bo_cache_destroy(dev); - if (vc4->hang_state) vc4_free_hang_state(dev, vc4->hang_state); + + vc4_bo_cache_destroy(dev); } diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index fd2644d23..4452f3631 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -208,14 +208,6 @@ static int vc4_hdmi_connector_get_modes(struct drm_connector *connector) return ret; } -static struct drm_encoder * -vc4_hdmi_connector_best_encoder(struct drm_connector *connector) -{ - struct vc4_hdmi_connector *hdmi_connector = - to_vc4_hdmi_connector(connector); - return hdmi_connector->encoder; -} - static const struct drm_connector_funcs vc4_hdmi_connector_funcs = { .dpms = drm_atomic_helper_connector_dpms, .detect = vc4_hdmi_connector_detect, @@ -228,7 +220,6 @@ static const struct drm_connector_funcs vc4_hdmi_connector_funcs = { static const struct drm_connector_helper_funcs vc4_hdmi_connector_helper_funcs = { .get_modes = vc4_hdmi_connector_get_modes, - .best_encoder = vc4_hdmi_connector_best_encoder, }; static struct drm_connector *vc4_hdmi_connector_init(struct drm_device *dev, @@ -465,12 +456,6 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data) if (IS_ERR(hdmi->hd_regs)) return PTR_ERR(hdmi->hd_regs); - ddc_node = of_parse_phandle(dev->of_node, "ddc", 0); - if (!ddc_node) { - DRM_ERROR("Failed to find ddc node in device tree\n"); - return -ENODEV; - } - hdmi->pixel_clock = devm_clk_get(dev, "pixel"); if (IS_ERR(hdmi->pixel_clock)) { DRM_ERROR("Failed to get pixel clock\n"); @@ -482,7 +467,14 @@ static int vc4_hdmi_bind(struct device *dev, struct device *master, void *data) return PTR_ERR(hdmi->hsm_clock); } + ddc_node = of_parse_phandle(dev->of_node, "ddc", 0); + if (!ddc_node) { + DRM_ERROR("Failed to find ddc node in device tree\n"); + return -ENODEV; + } + hdmi->ddc = of_find_i2c_adapter_by_node(ddc_node); + of_node_put(ddc_node); if (!hdmi->ddc) { DRM_DEBUG("Failed to get ddc i2c adapter by node\n"); return -EPROBE_DEFER; diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c index 861a623bc..4ac894d99 100644 --- a/drivers/gpu/drm/vc4/vc4_kms.c +++ b/drivers/gpu/drm/vc4/vc4_kms.c @@ -26,8 +26,7 @@ static void vc4_output_poll_changed(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - if (vc4->fbdev) - drm_fbdev_cma_hotplug_event(vc4->fbdev); + drm_fbdev_cma_hotplug_event(vc4->fbdev); } struct vc4_commit { @@ -111,6 +110,8 @@ static int vc4_atomic_commit(struct drm_device *dev, int i; uint64_t wait_seqno = 0; struct vc4_commit *c; + struct drm_plane *plane; + struct drm_plane_state *new_state; c = commit_init(state); if (!c) @@ -138,13 +139,7 @@ static int vc4_atomic_commit(struct drm_device *dev, return ret; } - for (i = 0; i < dev->mode_config.num_total_plane; i++) { - struct drm_plane *plane = state->planes[i]; - struct drm_plane_state *new_state = state->plane_states[i]; - - if (!plane) - continue; - + for_each_plane_in_state(state, plane, new_state, i) { if ((plane->state->fb != new_state->fb) && new_state->fb) { struct drm_gem_cma_object *cma_bo = drm_fb_cma_get_gem_obj(new_state->fb, 0); @@ -160,7 +155,7 @@ static int vc4_atomic_commit(struct drm_device *dev, * the software side now. */ - drm_atomic_helper_swap_state(dev, state); + drm_atomic_helper_swap_state(state, true); /* * Everything below can be run asynchronously without the need to grab diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c index 4037b52fd..29e4b400e 100644 --- a/drivers/gpu/drm/vc4/vc4_plane.c +++ b/drivers/gpu/drm/vc4/vc4_plane.c @@ -93,6 +93,14 @@ static const struct hvs_format { .drm = DRM_FORMAT_ARGB8888, .hvs = HVS_PIXEL_FORMAT_RGBA8888, .pixel_order = HVS_PIXEL_ORDER_ABGR, .has_alpha = true, }, + { + .drm = DRM_FORMAT_ABGR8888, .hvs = HVS_PIXEL_FORMAT_RGBA8888, + .pixel_order = HVS_PIXEL_ORDER_ARGB, .has_alpha = true, + }, + { + .drm = DRM_FORMAT_XBGR8888, .hvs = HVS_PIXEL_FORMAT_RGBA8888, + .pixel_order = HVS_PIXEL_ORDER_ARGB, .has_alpha = false, + }, { .drm = DRM_FORMAT_RGB565, .hvs = HVS_PIXEL_FORMAT_RGB565, .pixel_order = HVS_PIXEL_ORDER_XRGB, .has_alpha = false, @@ -690,9 +698,10 @@ u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist) return vc4_state->dlist_count; } -u32 vc4_plane_dlist_size(struct drm_plane_state *state) +u32 vc4_plane_dlist_size(const struct drm_plane_state *state) { - struct vc4_plane_state *vc4_state = to_vc4_plane_state(state); + const struct vc4_plane_state *vc4_state = + container_of(state, typeof(*vc4_state), base); return vc4_state->dlist_count; } diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h index d5c2f3c85..f4e795a0d 100644 --- a/drivers/gpu/drm/vc4/vc4_qpu_defines.h +++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h @@ -70,7 +70,7 @@ enum qpu_raddr { QPU_R_ELEM_QPU = 38, QPU_R_NOP, QPU_R_XY_PIXEL_COORD = 41, - QPU_R_MS_REV_FLAGS = 41, + QPU_R_MS_REV_FLAGS = 42, QPU_R_VPM = 48, QPU_R_VPM_LD_BUSY, QPU_R_VPM_LD_WAIT, @@ -230,6 +230,15 @@ enum qpu_unpack_r4 { #define QPU_COND_MUL_SHIFT 46 #define QPU_COND_MUL_MASK QPU_MASK(48, 46) +#define QPU_BRANCH_COND_SHIFT 52 +#define QPU_BRANCH_COND_MASK QPU_MASK(55, 52) + +#define QPU_BRANCH_REL ((uint64_t)1 << 51) +#define QPU_BRANCH_REG ((uint64_t)1 << 50) + +#define QPU_BRANCH_RADDR_A_SHIFT 45 +#define QPU_BRANCH_RADDR_A_MASK QPU_MASK(49, 45) + #define QPU_SF ((uint64_t)1 << 45) #define QPU_WADDR_ADD_SHIFT 38 @@ -261,4 +270,10 @@ enum qpu_unpack_r4 { #define QPU_OP_ADD_SHIFT 24 #define QPU_OP_ADD_MASK QPU_MASK(28, 24) +#define QPU_LOAD_IMM_SHIFT 0 +#define QPU_LOAD_IMM_MASK QPU_MASK(31, 0) + +#define QPU_BRANCH_TARGET_SHIFT 0 +#define QPU_BRANCH_TARGET_MASK QPU_MASK(31, 0) + #endif /* VC4_QPU_DEFINES_H */ diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h index f99eece4c..160942a91 100644 --- a/drivers/gpu/drm/vc4/vc4_regs.h +++ b/drivers/gpu/drm/vc4/vc4_regs.h @@ -366,7 +366,6 @@ # define SCALER_DISPBKGND_FILL BIT(24) #define SCALER_DISPSTAT0 0x00000048 -#define SCALER_DISPBASE0 0x0000004c # define SCALER_DISPSTATX_MODE_MASK VC4_MASK(31, 30) # define SCALER_DISPSTATX_MODE_SHIFT 30 # define SCALER_DISPSTATX_MODE_DISABLED 0 @@ -375,6 +374,24 @@ # define SCALER_DISPSTATX_MODE_EOF 3 # define SCALER_DISPSTATX_FULL BIT(29) # define SCALER_DISPSTATX_EMPTY BIT(28) +# define SCALER_DISPSTATX_FRAME_COUNT_MASK VC4_MASK(17, 12) +# define SCALER_DISPSTATX_FRAME_COUNT_SHIFT 12 +# define SCALER_DISPSTATX_LINE_MASK VC4_MASK(11, 0) +# define SCALER_DISPSTATX_LINE_SHIFT 0 + +#define SCALER_DISPBASE0 0x0000004c +/* Last pixel in the COB (display FIFO memory) allocated to this HVS + * channel. Must be 4-pixel aligned (and thus 4 pixels less than the + * next COB base). + */ +# define SCALER_DISPBASEX_TOP_MASK VC4_MASK(31, 16) +# define SCALER_DISPBASEX_TOP_SHIFT 16 +/* First pixel in the COB (display FIFO memory) allocated to this HVS + * channel. Must be 4-pixel aligned. + */ +# define SCALER_DISPBASEX_BASE_MASK VC4_MASK(15, 0) +# define SCALER_DISPBASEX_BASE_SHIFT 0 + #define SCALER_DISPCTRL1 0x00000050 #define SCALER_DISPBKGND1 0x00000054 #define SCALER_DISPBKGNDX(x) (SCALER_DISPBKGND0 + \ @@ -385,6 +402,9 @@ (x) * (SCALER_DISPSTAT1 - \ SCALER_DISPSTAT0)) #define SCALER_DISPBASE1 0x0000005c +#define SCALER_DISPBASEX(x) (SCALER_DISPBASE0 + \ + (x) * (SCALER_DISPBASE1 - \ + SCALER_DISPBASE0)) #define SCALER_DISPCTRL2 0x00000060 #define SCALER_DISPCTRLX(x) (SCALER_DISPCTRL0 + \ (x) * (SCALER_DISPCTRL1 - \ diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c index 24c2c746e..9ce1d0adf 100644 --- a/drivers/gpu/drm/vc4/vc4_validate.c +++ b/drivers/gpu/drm/vc4/vc4_validate.c @@ -802,7 +802,7 @@ validate_gl_shader_rec(struct drm_device *dev, uint32_t src_offset = *(uint32_t *)(pkt_u + o); uint32_t *texture_handles_u; void *uniform_data_u; - uint32_t tex; + uint32_t tex, uni; *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset; @@ -840,6 +840,17 @@ validate_gl_shader_rec(struct drm_device *dev, } } + /* Fill in the uniform slots that need this shader's + * start-of-uniforms address (used for resetting the uniform + * stream in the presence of control flow). + */ + for (uni = 0; + uni < validated_shader->num_uniform_addr_offsets; + uni++) { + uint32_t o = validated_shader->uniform_addr_offsets[uni]; + ((uint32_t *)exec->uniforms_v)[o] = exec->uniforms_p; + } + *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p; exec->uniforms_u += validated_shader->uniforms_src_size; diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c index f67124b4c..2543cf5b8 100644 --- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c +++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c @@ -39,7 +39,17 @@ #include "vc4_drv.h" #include "vc4_qpu_defines.h" +#define LIVE_REG_COUNT (32 + 32 + 4) + struct vc4_shader_validation_state { + /* Current IP being validated. */ + uint32_t ip; + + /* IP at the end of the BO, do not read shader[max_ip] */ + uint32_t max_ip; + + uint64_t *shader; + struct vc4_texture_sample_info tmu_setup[2]; int tmu_write_count[2]; @@ -49,8 +59,30 @@ struct vc4_shader_validation_state { * * This is used for the validation of direct address memory reads. */ - uint32_t live_min_clamp_offsets[32 + 32 + 4]; - bool live_max_clamp_regs[32 + 32 + 4]; + uint32_t live_min_clamp_offsets[LIVE_REG_COUNT]; + bool live_max_clamp_regs[LIVE_REG_COUNT]; + uint32_t live_immediates[LIVE_REG_COUNT]; + + /* Bitfield of which IPs are used as branch targets. + * + * Used for validation that the uniform stream is updated at the right + * points and clearing the texturing/clamping state. + */ + unsigned long *branch_targets; + + /* Set when entering a basic block, and cleared when the uniform + * address update is found. This is used to make sure that we don't + * read uniforms when the address is undefined. + */ + bool needs_uniform_address_update; + + /* Set when we find a backwards branch. If the branch is backwards, + * the taraget is probably doing an address reset to read uniforms, + * and so we need to be sure that a uniforms address is present in the + * stream, even if the shader didn't need to read uniforms in later + * basic blocks. + */ + bool needs_uniform_address_for_loop; }; static uint32_t @@ -129,11 +161,11 @@ record_texture_sample(struct vc4_validated_shader_info *validated_shader, } static bool -check_tmu_write(uint64_t inst, - struct vc4_validated_shader_info *validated_shader, +check_tmu_write(struct vc4_validated_shader_info *validated_shader, struct vc4_shader_validation_state *validation_state, bool is_mul) { + uint64_t inst = validation_state->shader[validation_state->ip]; uint32_t waddr = (is_mul ? QPU_GET_FIELD(inst, QPU_WADDR_MUL) : QPU_GET_FIELD(inst, QPU_WADDR_ADD)); @@ -162,7 +194,7 @@ check_tmu_write(uint64_t inst, return false; } - /* We assert that the the clamped address is the first + /* We assert that the clamped address is the first * argument, and the UBO base address is the second argument. * This is arbitrary, but simpler than supporting flipping the * two either way. @@ -212,8 +244,14 @@ check_tmu_write(uint64_t inst, /* Since direct uses a RADDR uniform reference, it will get counted in * check_instruction_reads() */ - if (!is_direct) + if (!is_direct) { + if (validation_state->needs_uniform_address_update) { + DRM_ERROR("Texturing with undefined uniform address\n"); + return false; + } + validated_shader->uniforms_size += 4; + } if (submit) { if (!record_texture_sample(validated_shader, @@ -227,23 +265,144 @@ check_tmu_write(uint64_t inst, return true; } +static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader) +{ + uint32_t o = validated_shader->num_uniform_addr_offsets; + uint32_t num_uniforms = validated_shader->uniforms_size / 4; + + validated_shader->uniform_addr_offsets = + krealloc(validated_shader->uniform_addr_offsets, + (o + 1) * + sizeof(*validated_shader->uniform_addr_offsets), + GFP_KERNEL); + if (!validated_shader->uniform_addr_offsets) + return false; + + validated_shader->uniform_addr_offsets[o] = num_uniforms; + validated_shader->num_uniform_addr_offsets++; + + return true; +} + static bool -check_reg_write(uint64_t inst, - struct vc4_validated_shader_info *validated_shader, +validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + bool is_mul) +{ + uint64_t inst = validation_state->shader[validation_state->ip]; + u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B); + u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + u32 add_lri = raddr_add_a_to_live_reg_index(inst); + /* We want our reset to be pointing at whatever uniform follows the + * uniforms base address. + */ + u32 expected_offset = validated_shader->uniforms_size + 4; + + /* We only support absolute uniform address changes, and we + * require that they be in the current basic block before any + * of its uniform reads. + * + * One could potentially emit more efficient QPU code, by + * noticing that (say) an if statement does uniform control + * flow for all threads and that the if reads the same number + * of uniforms on each side. However, this scheme is easy to + * validate so it's all we allow for now. + */ + switch (QPU_GET_FIELD(inst, QPU_SIG)) { + case QPU_SIG_NONE: + case QPU_SIG_SCOREBOARD_UNLOCK: + case QPU_SIG_COLOR_LOAD: + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + break; + default: + DRM_ERROR("uniforms address change must be " + "normal math\n"); + return false; + } + + if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) { + DRM_ERROR("Uniform address reset must be an ADD.\n"); + return false; + } + + if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) { + DRM_ERROR("Uniform address reset must be unconditional.\n"); + return false; + } + + if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP && + !(inst & QPU_PM)) { + DRM_ERROR("No packing allowed on uniforms reset\n"); + return false; + } + + if (add_lri == -1) { + DRM_ERROR("First argument of uniform address write must be " + "an immediate value.\n"); + return false; + } + + if (validation_state->live_immediates[add_lri] != expected_offset) { + DRM_ERROR("Resetting uniforms with offset %db instead of %db\n", + validation_state->live_immediates[add_lri], + expected_offset); + return false; + } + + if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) && + !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) { + DRM_ERROR("Second argument of uniform address write must be " + "a uniform.\n"); + return false; + } + + validation_state->needs_uniform_address_update = false; + validation_state->needs_uniform_address_for_loop = false; + return require_uniform_address_uniform(validated_shader); +} + +static bool +check_reg_write(struct vc4_validated_shader_info *validated_shader, struct vc4_shader_validation_state *validation_state, bool is_mul) { + uint64_t inst = validation_state->shader[validation_state->ip]; uint32_t waddr = (is_mul ? QPU_GET_FIELD(inst, QPU_WADDR_MUL) : QPU_GET_FIELD(inst, QPU_WADDR_ADD)); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + bool ws = inst & QPU_WS; + bool is_b = is_mul ^ ws; + u32 lri = waddr_to_live_reg_index(waddr, is_b); + + if (lri != -1) { + uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD); + uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL); + + if (sig == QPU_SIG_LOAD_IMM && + QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP && + ((is_mul && cond_mul == QPU_COND_ALWAYS) || + (!is_mul && cond_add == QPU_COND_ALWAYS))) { + validation_state->live_immediates[lri] = + QPU_GET_FIELD(inst, QPU_LOAD_IMM); + } else { + validation_state->live_immediates[lri] = ~0; + } + } switch (waddr) { case QPU_W_UNIFORMS_ADDRESS: - /* XXX: We'll probably need to support this for reladdr, but - * it's definitely a security-related one. - */ - DRM_ERROR("uniforms address load unsupported\n"); - return false; + if (is_b) { + DRM_ERROR("relative uniforms address change " + "unsupported\n"); + return false; + } + + return validate_uniform_address_write(validated_shader, + validation_state, + is_mul); case QPU_W_TLB_COLOR_MS: case QPU_W_TLB_COLOR_ALL: @@ -261,7 +420,7 @@ check_reg_write(uint64_t inst, case QPU_W_TMU1_T: case QPU_W_TMU1_R: case QPU_W_TMU1_B: - return check_tmu_write(inst, validated_shader, validation_state, + return check_tmu_write(validated_shader, validation_state, is_mul); case QPU_W_HOST_INT: @@ -294,10 +453,10 @@ check_reg_write(uint64_t inst, } static void -track_live_clamps(uint64_t inst, - struct vc4_validated_shader_info *validated_shader, +track_live_clamps(struct vc4_validated_shader_info *validated_shader, struct vc4_shader_validation_state *validation_state) { + uint64_t inst = validation_state->shader[validation_state->ip]; uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD); uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); @@ -369,10 +528,10 @@ track_live_clamps(uint64_t inst, } static bool -check_instruction_writes(uint64_t inst, - struct vc4_validated_shader_info *validated_shader, +check_instruction_writes(struct vc4_validated_shader_info *validated_shader, struct vc4_shader_validation_state *validation_state) { + uint64_t inst = validation_state->shader[validation_state->ip]; uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); bool ok; @@ -382,20 +541,44 @@ check_instruction_writes(uint64_t inst, return false; } - ok = (check_reg_write(inst, validated_shader, validation_state, - false) && - check_reg_write(inst, validated_shader, validation_state, - true)); + ok = (check_reg_write(validated_shader, validation_state, false) && + check_reg_write(validated_shader, validation_state, true)); - track_live_clamps(inst, validated_shader, validation_state); + track_live_clamps(validated_shader, validation_state); return ok; } static bool -check_instruction_reads(uint64_t inst, - struct vc4_validated_shader_info *validated_shader) +check_branch(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + int ip) +{ + int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET); + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + + if ((int)branch_imm < 0) + validation_state->needs_uniform_address_for_loop = true; + + /* We don't want to have to worry about validation of this, and + * there's no need for it. + */ + if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) { + DRM_ERROR("branch instruction at %d wrote a register.\n", + validation_state->ip); + return false; + } + + return true; +} + +static bool +check_instruction_reads(struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state) { + uint64_t inst = validation_state->shader[validation_state->ip]; uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); @@ -407,40 +590,204 @@ check_instruction_reads(uint64_t inst, * already be OOM. */ validated_shader->uniforms_size += 4; + + if (validation_state->needs_uniform_address_update) { + DRM_ERROR("Uniform read with undefined uniform " + "address\n"); + return false; + } } return true; } +/* Make sure that all branches are absolute and point within the shader, and + * note their targets for later. + */ +static bool +vc4_validate_branches(struct vc4_shader_validation_state *validation_state) +{ + uint32_t max_branch_target = 0; + bool found_shader_end = false; + int ip; + int shader_end_ip = 0; + int last_branch = -2; + + for (ip = 0; ip < validation_state->max_ip; ip++) { + uint64_t inst = validation_state->shader[ip]; + int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + uint32_t after_delay_ip = ip + 4; + uint32_t branch_target_ip; + + if (sig == QPU_SIG_PROG_END) { + shader_end_ip = ip; + found_shader_end = true; + continue; + } + + if (sig != QPU_SIG_BRANCH) + continue; + + if (ip - last_branch < 4) { + DRM_ERROR("Branch at %d during delay slots\n", ip); + return false; + } + last_branch = ip; + + if (inst & QPU_BRANCH_REG) { + DRM_ERROR("branching from register relative " + "not supported\n"); + return false; + } + + if (!(inst & QPU_BRANCH_REL)) { + DRM_ERROR("relative branching required\n"); + return false; + } + + /* The actual branch target is the instruction after the delay + * slots, plus whatever byte offset is in the low 32 bits of + * the instruction. Make sure we're not branching beyond the + * end of the shader object. + */ + if (branch_imm % sizeof(inst) != 0) { + DRM_ERROR("branch target not aligned\n"); + return false; + } + + branch_target_ip = after_delay_ip + (branch_imm >> 3); + if (branch_target_ip >= validation_state->max_ip) { + DRM_ERROR("Branch at %d outside of shader (ip %d/%d)\n", + ip, branch_target_ip, + validation_state->max_ip); + return false; + } + set_bit(branch_target_ip, validation_state->branch_targets); + + /* Make sure that the non-branching path is also not outside + * the shader. + */ + if (after_delay_ip >= validation_state->max_ip) { + DRM_ERROR("Branch at %d continues past shader end " + "(%d/%d)\n", + ip, after_delay_ip, validation_state->max_ip); + return false; + } + set_bit(after_delay_ip, validation_state->branch_targets); + max_branch_target = max(max_branch_target, after_delay_ip); + + /* There are two delay slots after program end is signaled + * that are still executed, then we're finished. + */ + if (found_shader_end && ip == shader_end_ip + 2) + break; + } + + if (max_branch_target > shader_end_ip) { + DRM_ERROR("Branch landed after QPU_SIG_PROG_END"); + return false; + } + + return true; +} + +/* Resets any known state for the shader, used when we may be branched to from + * multiple locations in the program (or at shader start). + */ +static void +reset_validation_state(struct vc4_shader_validation_state *validation_state) +{ + int i; + + for (i = 0; i < 8; i++) + validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0; + + for (i = 0; i < LIVE_REG_COUNT; i++) { + validation_state->live_min_clamp_offsets[i] = ~0; + validation_state->live_max_clamp_regs[i] = false; + validation_state->live_immediates[i] = ~0; + } +} + +static bool +texturing_in_progress(struct vc4_shader_validation_state *validation_state) +{ + return (validation_state->tmu_write_count[0] != 0 || + validation_state->tmu_write_count[1] != 0); +} + +static bool +vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state) +{ + uint32_t ip = validation_state->ip; + + if (!test_bit(ip, validation_state->branch_targets)) + return true; + + if (texturing_in_progress(validation_state)) { + DRM_ERROR("Branch target landed during TMU setup\n"); + return false; + } + + /* Reset our live values tracking, since this instruction may have + * multiple predecessors. + * + * One could potentially do analysis to determine that, for + * example, all predecessors have a live max clamp in the same + * register, but we don't bother with that. + */ + reset_validation_state(validation_state); + + /* Since we've entered a basic block from potentially multiple + * predecessors, we need the uniforms address to be updated before any + * unforms are read. We require that after any branch point, the next + * uniform to be loaded is a uniform address offset. That uniform's + * offset will be marked by the uniform address register write + * validation, or a one-off the end-of-program check. + */ + validation_state->needs_uniform_address_update = true; + + return true; +} + struct vc4_validated_shader_info * vc4_validate_shader(struct drm_gem_cma_object *shader_obj) { bool found_shader_end = false; int shader_end_ip = 0; - uint32_t ip, max_ip; - uint64_t *shader; - struct vc4_validated_shader_info *validated_shader; + uint32_t ip; + struct vc4_validated_shader_info *validated_shader = NULL; struct vc4_shader_validation_state validation_state; - int i; memset(&validation_state, 0, sizeof(validation_state)); + validation_state.shader = shader_obj->vaddr; + validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t); - for (i = 0; i < 8; i++) - validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0; - for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++) - validation_state.live_min_clamp_offsets[i] = ~0; + reset_validation_state(&validation_state); - shader = shader_obj->vaddr; - max_ip = shader_obj->base.size / sizeof(uint64_t); + validation_state.branch_targets = + kcalloc(BITS_TO_LONGS(validation_state.max_ip), + sizeof(unsigned long), GFP_KERNEL); + if (!validation_state.branch_targets) + goto fail; validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL); if (!validated_shader) - return NULL; + goto fail; + + if (!vc4_validate_branches(&validation_state)) + goto fail; - for (ip = 0; ip < max_ip; ip++) { - uint64_t inst = shader[ip]; + for (ip = 0; ip < validation_state.max_ip; ip++) { + uint64_t inst = validation_state.shader[ip]; uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + validation_state.ip = ip; + + if (!vc4_handle_branch_target(&validation_state)) + goto fail; + switch (sig) { case QPU_SIG_NONE: case QPU_SIG_WAIT_FOR_SCOREBOARD: @@ -450,13 +797,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) case QPU_SIG_LOAD_TMU1: case QPU_SIG_PROG_END: case QPU_SIG_SMALL_IMM: - if (!check_instruction_writes(inst, validated_shader, + if (!check_instruction_writes(validated_shader, &validation_state)) { DRM_ERROR("Bad write at ip %d\n", ip); goto fail; } - if (!check_instruction_reads(inst, validated_shader)) + if (!check_instruction_reads(validated_shader, + &validation_state)) goto fail; if (sig == QPU_SIG_PROG_END) { @@ -467,13 +815,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) break; case QPU_SIG_LOAD_IMM: - if (!check_instruction_writes(inst, validated_shader, + if (!check_instruction_writes(validated_shader, &validation_state)) { DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip); goto fail; } break; + case QPU_SIG_BRANCH: + if (!check_branch(inst, validated_shader, + &validation_state, ip)) + goto fail; + break; default: DRM_ERROR("Unsupported QPU signal %d at " "instruction %d\n", sig, ip); @@ -487,13 +840,28 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) break; } - if (ip == max_ip) { + if (ip == validation_state.max_ip) { DRM_ERROR("shader failed to terminate before " "shader BO end at %zd\n", shader_obj->base.size); goto fail; } + /* If we did a backwards branch and we haven't emitted a uniforms + * reset since then, we still need the uniforms stream to have the + * uniforms address available so that the backwards branch can do its + * uniforms reset. + * + * We could potentially prove that the backwards branch doesn't + * contain any uses of uniforms until program exit, but that doesn't + * seem to be worth the trouble. + */ + if (validation_state.needs_uniform_address_for_loop) { + if (!require_uniform_address_uniform(validated_shader)) + goto fail; + validated_shader->uniforms_size += 4; + } + /* Again, no chance of integer overflow here because the worst case * scenario is 8 bytes of uniforms plus handles per 8-byte * instruction. @@ -502,9 +870,12 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) (validated_shader->uniforms_size + 4 * validated_shader->num_texture_samples); + kfree(validation_state.branch_targets); + return validated_shader; fail: + kfree(validation_state.branch_targets); if (validated_shader) { kfree(validated_shader->texture_samples); kfree(validated_shader); -- cgit v1.2.3-54-g00ecf