/*
 * FPU data structures:
 *
 * NOTE(review): this header relies on kernel-provided fixed-width types
 * (u8/u16/u32/u64), PAGE_SIZE, __packed and 'struct math_emu_info' being
 * visible from other kernel headers -- confirm the including context.
 *
 * All structures below mirror hardware save-area layouts (FSAVE/FXSAVE/
 * XSAVE); field order, widths and packing are ABI and must not change.
 */
#ifndef _ASM_X86_FPU_H
#define _ASM_X86_FPU_H

/*
 * The legacy x87 FPU state format, as saved by FSAVE and
 * restored by the FRSTOR instructions:
 */
struct fregs_state {
	u32			cwd;	/* FPU Control Word		*/
	u32			swd;	/* FPU Status Word		*/
	u32			twd;	/* FPU Tag Word			*/
	u32			fip;	/* FPU IP Offset		*/
	u32			fcs;	/* FPU IP Selector		*/
	u32			foo;	/* FPU Operand Pointer Offset	*/
	u32			fos;	/* FPU Operand Pointer Selector	*/

	/* 8*10 bytes for each FP-reg = 80 bytes: */
	u32			st_space[20];

	/* Software status information [not touched by FSAVE]: */
	u32			status;
};

/*
 * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
 * restored by the FXRSTOR instructions. It's similar to the FSAVE
 * format, but differs in some areas, plus has extensions at
 * the end for the XMM registers.
 */
struct fxregs_state {
	u16			cwd;	/* Control Word			*/
	u16			swd;	/* Status Word			*/
	u16			twd;	/* Tag Word			*/
	u16			fop;	/* Last Instruction Opcode	*/
	union {
		/* 64-bit format: saved when FXSAVE executes in 64-bit mode */
		struct {
			u64	rip;	/* Instruction Pointer		*/
			u64	rdp;	/* Data Pointer			*/
		};
		/* 32-bit format: segmented selector:offset pointers */
		struct {
			u32	fip;	/* FPU IP Offset		*/
			u32	fcs;	/* FPU IP Selector		*/
			u32	foo;	/* FPU Operand Offset		*/
			u32	fos;	/* FPU Operand Selector		*/
		};
	};
	u32			mxcsr;		/* MXCSR Register State */
	u32			mxcsr_mask;	/* MXCSR Mask		*/

	/* 8*16 bytes for each FP-reg = 128 bytes: */
	u32			st_space[32];

	/* 16*16 bytes for each XMM-reg = 256 bytes: */
	u32			xmm_space[64];

	u32			padding[12];

	union {
		u32		padding1[12];
		u32		sw_reserved[12];
	};

} __attribute__((aligned(16)));

/* Default value for fxregs_state.mxcsr: */
#define MXCSR_DEFAULT		0x1f80

/*
 * Software based FPU emulation state. This is arbitrary really,
 * it matches the x87 format to make it easier to understand:
 */
struct swregs_state {
	u32			cwd;
	u32			swd;
	u32			twd;
	u32			fip;
	u32			fcs;
	u32			foo;
	u32			fos;
	/* 8*10 bytes for each FP-reg = 80 bytes: */
	u32			st_space[20];
	u8			ftop;		/* top-of-stack pointer (emulator bookkeeping) */
	u8			changed;
	u8			lookahead;
	u8			no_update;
	u8			rm;
	u8			alimit;
	struct math_emu_info	*info;
	u32			entry_eip;
};

/*
 * List of XSAVE features Linux knows about.
 *
 * The enumerators are the bit positions within the XSAVE feature
 * mask (XCR0/xfeatures); the XSTATE_* macros below expose the
 * corresponding single-bit masks.
 */
enum xfeature_bit {
	XSTATE_BIT_FP,
	XSTATE_BIT_SSE,
	XSTATE_BIT_YMM,
	XSTATE_BIT_BNDREGS,
	XSTATE_BIT_BNDCSR,
	XSTATE_BIT_OPMASK,
	XSTATE_BIT_ZMM_Hi256,
	XSTATE_BIT_Hi16_ZMM,

	XFEATURES_NR_MAX,
};

#define XSTATE_FP		(1 << XSTATE_BIT_FP)
#define XSTATE_SSE		(1 << XSTATE_BIT_SSE)
#define XSTATE_YMM		(1 << XSTATE_BIT_YMM)
#define XSTATE_BNDREGS		(1 << XSTATE_BIT_BNDREGS)
#define XSTATE_BNDCSR		(1 << XSTATE_BIT_BNDCSR)
#define XSTATE_OPMASK		(1 << XSTATE_BIT_OPMASK)
#define XSTATE_ZMM_Hi256	(1 << XSTATE_BIT_ZMM_Hi256)
#define XSTATE_Hi16_ZMM		(1 << XSTATE_BIT_Hi16_ZMM)

/* Convenience combinations: */
#define XSTATE_FPSSE		(XSTATE_FP | XSTATE_SSE)
#define XSTATE_AVX512		(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)

/*
 * There are 16x 256-bit AVX registers named YMM0-YMM15.
 * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
 * and are stored in 'struct fxregs_state::xmm_space[]'.
 *
 * The high 128 bits are stored here:
 *    16x 128 bits == 256 bytes.
 */
struct ymmh_struct {
	u8				ymmh_space[256];
};

/* We don't support LWP yet: */
struct lwp_struct {
	u8				reserved[128];
};

/* Intel MPX support: */

/* One bounds register (BND0-BND3): */
struct bndreg {
	u64				lower_bound;
	u64				upper_bound;
} __packed;

/* Bounds configuration and status registers: */
struct bndcsr {
	u64				bndcfgu;
	u64				bndstatus;
} __packed;

struct mpx_struct {
	struct bndreg			bndreg[4];
	struct bndcsr			bndcsr;
};

/*
 * Header at the start of the XSAVE extended area: which features
 * are present (xfeatures) and, for the compacted format, the
 * compaction bitmap (xcomp_bv):
 */
struct xstate_header {
	u64				xfeatures;
	u64				xcomp_bv;
	u64				reserved[6];
} __attribute__((packed));

/* New processor state extensions should be added here: */
#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \
			sizeof(struct lwp_struct)  + \
			sizeof(struct mpx_struct)  )

/*
 * This is our most modern FPU state format, as saved by the XSAVE
 * and restored by the XRSTOR instructions.
 *
 * It consists of a legacy fxregs portion, an xstate header and
 * subsequent fixed size areas as defined by the xstate header.
 * Not all CPUs support all the extensions.
 */
struct xregs_state {
	struct fxregs_state		i387;
	struct xstate_header		header;
	u8				__reserved[XSTATE_RESERVE];
} __attribute__ ((packed, aligned (64)));

/*
 * This is a union of all the possible FPU state formats
 * put together, so that we can pick the right one runtime.
 *
 * The size of the structure is determined by the largest
 * member - which is the xsave area:
 */
union fpregs_state {
	struct fregs_state		fsave;
	struct fxregs_state		fxsave;
	struct swregs_state		soft;
	struct xregs_state		xsave;

	/*
	 * NOTE(review): pads the union to a full page; presumably so the
	 * dynamically-sized xsave area can grow within it -- confirm with
	 * the allocation sites.
	 */
	u8 __padding[PAGE_SIZE];
};

/*
 * Highest level per task FPU state data structure that
 * contains the FPU register state plus various FPU
 * state fields:
 */
struct fpu {
	/*
	 * @last_cpu:
	 *
	 * Records the last CPU on which this context was loaded into
	 * FPU registers. (In the lazy-restore case we might be
	 * able to reuse FPU registers across multiple context switches
	 * this way, if no intermediate task used the FPU.)
	 *
	 * A value of -1 is used to indicate that the FPU state in context
	 * memory is newer than the FPU state in registers, and that the
	 * FPU state should be reloaded next time the task is run.
	 */
	unsigned int			last_cpu;

	/*
	 * @fpstate_active:
	 *
	 * This flag indicates whether this context is active: if the task
	 * is not running then we can restore from this context, if the task
	 * is running then we should save into this context.
	 */
	unsigned char			fpstate_active;

	/*
	 * @fpregs_active:
	 *
	 * This flag determines whether a given context is actively
	 * loaded into the FPU's registers and that those registers
	 * represent the task's current FPU state.
	 *
	 * Note the interaction with fpstate_active:
	 *
	 *   # task does not use the FPU:
	 *   fpstate_active == 0
	 *
	 *   # task uses the FPU and regs are active:
	 *   fpstate_active == 1 && fpregs_active == 1
	 *
	 *   # the regs are inactive but still match fpstate:
	 *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
	 *
	 * The third state is what we use for the lazy restore optimization
	 * on lazy-switching CPUs.
	 */
	unsigned char			fpregs_active;

	/*
	 * @counter:
	 *
	 * This counter contains the number of consecutive context switches
	 * during which the FPU stays used. If this is over a threshold, the
	 * lazy FPU restore logic becomes eager, to save the trap overhead.
	 * This is an unsigned char so that after 256 iterations the counter
	 * wraps and the context switch behavior turns lazy again; this is to
	 * deal with bursty apps that only use the FPU for a short time:
	 */
	unsigned char			counter;

	/*
	 * @state:
	 *
	 * In-memory copy of all FPU registers that we save/restore
	 * over context switches. If the task is using the FPU then
	 * the registers in the FPU are more recent than this state
	 * copy. If the task context-switches away then they get
	 * saved here and represent the FPU state.
	 *
	 * After context switches there may be a (short) time period
	 * during which the in-FPU hardware registers are unchanged
	 * and still perfectly match this state, if the tasks
	 * scheduled afterwards are not using the FPU.
	 *
	 * This is the 'lazy restore' window of optimization, which
	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
	 *
	 * We detect whether a subsequent task uses the FPU via setting
	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
	 *
	 * During this window, if the task gets scheduled again, we
	 * might be able to skip having to do a restore from this
	 * memory buffer to the hardware registers - at the cost of
	 * incurring the overhead of #NM fault traps.
	 *
	 * Note that on modern CPUs that support the XSAVEOPT (or other
	 * optimized XSAVE instructions), we don't use #NM traps anymore,
	 * as the hardware can track whether FPU registers need saving
	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
	 * logic, which unconditionally saves/restores all FPU state
	 * across context switches. (if FPU state exists.)
	 */
	union fpregs_state		state;
	/*
	 * WARNING: 'state' is dynamically-sized. Do not put
	 * anything after it here.
	 */
};

#endif /* _ASM_X86_FPU_H */