// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 - Linaro Ltd * Author: Jintack Lim */ #include #include #include #include enum trans_regime { TR_EL10, TR_EL20, TR_EL2, }; struct s1_walk_info { u64 baddr; enum trans_regime regime; unsigned int max_oa_bits; unsigned int pgshift; unsigned int txsz; int sl; bool hpd; bool e0poe; bool poe; bool pan; bool be; bool s2; }; struct s1_walk_result { union { struct { u64 desc; u64 pa; s8 level; u8 APTable; bool UXNTable; bool PXNTable; bool uwxn; bool uov; bool ur; bool uw; bool ux; bool pwxn; bool pov; bool pr; bool pw; bool px; }; struct { u8 fst; bool ptw; bool s2; }; }; bool failed; }; static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) { wr->fst = fst; wr->ptw = ptw; wr->s2 = s2; wr->failed = true; } #define S1_MMU_DISABLED (-127) static int get_ia_size(struct s1_walk_info *wi) { return 64 - wi->txsz; } /* Return true if the IPA is out of the OA range */ static bool check_output_size(u64 ipa, struct s1_walk_info *wi) { return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); } /* Return the translation regime that applies to an AT instruction */ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) { /* * We only get here from guest EL2, so the translation * regime AT applies to is solely defined by {E2H,TGE}. */ switch (op) { case OP_AT_S1E2R: case OP_AT_S1E2W: case OP_AT_S1E2A: return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; break; default: return (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; } } static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { if (!kvm_has_s1pie(vcpu->kvm)) return false; switch (regime) { case TR_EL2: case TR_EL20: return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; case TR_EL10: return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); default: BUG(); } } static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) { u64 val; if (!kvm_has_s1poe(vcpu->kvm)) { wi->poe = wi->e0poe = false; return; } switch (wi->regime) { case TR_EL2: case TR_EL20: val = vcpu_read_sys_reg(vcpu, TCR2_EL2); wi->poe = val & TCR2_EL2_POE; wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); break; case TR_EL10: if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { wi->poe = wi->e0poe = false; return; } val = __vcpu_sys_reg(vcpu, TCR2_EL1); wi->poe = val & TCR2_EL1x_POE; wi->e0poe = val & TCR2_EL1x_E0POE; } } static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; unsigned int stride, x; bool va55, tbi, lva, as_el0; hcr = __vcpu_sys_reg(vcpu, HCR_EL2); wi->regime = compute_translation_regime(vcpu, op); as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); va55 = va & BIT(55); if (wi->regime == TR_EL2 && va55) goto addrsz; wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); switch (wi->regime) { case TR_EL10: sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); ttbr = (va55 ? vcpu_read_sys_reg(vcpu, TTBR1_EL1) : vcpu_read_sys_reg(vcpu, TTBR0_EL1)); break; case TR_EL2: case TR_EL20: sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); ttbr = (va55 ? vcpu_read_sys_reg(vcpu, TTBR1_EL2) : vcpu_read_sys_reg(vcpu, TTBR0_EL2)); break; default: BUG(); } tbi = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_TBI, tcr) : (va55 ? FIELD_GET(TCR_TBI1, tcr) : FIELD_GET(TCR_TBI0, tcr))); if (!tbi && (u64)sign_extend64(va, 55) != va) goto addrsz; va = (u64)sign_extend64(va, 55); /* Let's put the MMU disabled case aside immediately */ switch (wi->regime) { case TR_EL10: /* * If dealing with the EL1&0 translation regime, 3 things * can disable the S1 translation: * * - HCR_EL2.DC = 1 * - HCR_EL2.{E2H,TGE} = {0,1} * - SCTLR_EL1.M = 0 * * The TGE part is interesting. If we have decided that this * is EL1&0, then it means that either {E2H,TGE} == {1,0} or * {0,x}, and we only need to test for TGE == 1. */ if (hcr & (HCR_DC | HCR_TGE)) { wr->level = S1_MMU_DISABLED; break; } fallthrough; case TR_EL2: case TR_EL20: if (!(sctlr & SCTLR_ELx_M)) wr->level = S1_MMU_DISABLED; break; } if (wr->level == S1_MMU_DISABLED) { if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) goto addrsz; wr->pa = va; return 0; } wi->be = sctlr & SCTLR_ELx_EE; wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); wi->hpd &= (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_HPD, tcr) : (va55 ? FIELD_GET(TCR_HPD1, tcr) : FIELD_GET(TCR_HPD0, tcr))); /* R_JHSVW */ wi->hpd |= s1pie_enabled(vcpu, wi->regime); /* Do we have POE? */ compute_s1poe(vcpu, wi); /* R_BVXDG */ wi->hpd |= (wi->poe || wi->e0poe); /* Someone was silly enough to encode TG0/TG1 differently */ if (va55) { wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); tg = FIELD_GET(TCR_TG1_MASK, tcr); switch (tg << TCR_TG1_SHIFT) { case TCR_TG1_4K: wi->pgshift = 12; break; case TCR_TG1_16K: wi->pgshift = 14; break; case TCR_TG1_64K: default: /* IMPDEF: treat any other value as 64k */ wi->pgshift = 16; break; } } else { wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); tg = FIELD_GET(TCR_TG0_MASK, tcr); switch (tg << TCR_TG0_SHIFT) { case TCR_TG0_4K: wi->pgshift = 12; break; case TCR_TG0_16K: wi->pgshift = 14; break; case TCR_TG0_64K: default: /* IMPDEF: treat any other value as 64k */ wi->pgshift = 16; break; } } /* R_PLCGL, R_YXNYW */ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { if (wi->txsz > 39) goto transfault_l0; } else { if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) goto transfault_l0; } /* R_GTJBY, R_SXWGM */ switch (BIT(wi->pgshift)) { case SZ_4K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); break; case SZ_16K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); break; case SZ_64K: lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); break; } if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) goto transfault_l0; ia_bits = get_ia_size(wi); /* R_YYVYV, I_THCZK */ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || (va55 && va < GENMASK(63, ia_bits))) goto transfault_l0; /* I_ZFSYQ */ if (wi->regime != TR_EL2 && (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) goto transfault_l0; /* R_BNDVG and following statements */ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) goto transfault_l0; /* AArch64.S1StartLevel() */ stride = wi->pgshift - 3; wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); ps = (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); /* Compute minimal alignment */ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); wi->baddr = ttbr & TTBRx_EL1_BADDR; /* R_VPBBF */ if (check_output_size(wi->baddr, wi)) goto addrsz; wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); return 0; addrsz: /* Address Size Fault level 0 */ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); return -EFAULT; transfault_l0: /* Translation Fault level 0 */ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); return -EFAULT; } static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { u64 va_top, va_bottom, baddr, desc; int level, stride, ret; level = wi->sl; stride = wi->pgshift - 3; baddr = wi->baddr; va_top = get_ia_size(wi) - 1; while (1) { u64 index, ipa; va_bottom = (3 - level) * stride + wi->pgshift; index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); ipa = baddr | index; if (wi->s2) { struct kvm_s2_trans s2_trans = {}; ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); if (ret) { fail_s1_walk(wr, (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, true, true); return ret; } if (!kvm_s2_trans_readable(&s2_trans)) { fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true, true); return -EPERM; } ipa = kvm_s2_trans_output(&s2_trans); } ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), true, false); return ret; } if (wi->be) desc = be64_to_cpu((__force __be64)desc); else desc = le64_to_cpu((__force __le64)desc); /* Invalid descriptor */ if (!(desc & BIT(0))) goto transfault; /* Block mapping, check validity down the line */ if (!(desc & BIT(1))) break; /* Page mapping */ if (level == 3) break; /* Table handling */ if (!wi->hpd) { wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); } baddr = desc & GENMASK_ULL(47, wi->pgshift); /* Check for out-of-range OA */ if (check_output_size(baddr, wi)) goto addrsz; /* Prepare for next round */ va_top = va_bottom - 1; level++; } /* Block mapping, check the validity of the level */ if (!(desc & BIT(1))) { bool valid_block = false; switch (BIT(wi->pgshift)) { case SZ_4K: valid_block = level == 1 || level == 2; break; case SZ_16K: case SZ_64K: valid_block = level == 2; break; } if (!valid_block) goto transfault; } if (check_output_size(desc & GENMASK(47, va_bottom), wi)) goto addrsz; va_bottom += contiguous_bit_shift(desc, wi, level); wr->failed = false; wr->level = level; wr->desc = desc; wr->pa = desc & GENMASK(47, va_bottom); wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); return 0; addrsz: fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); return -EINVAL; transfault: fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); return -ENOENT; } struct mmu_config { u64 ttbr0; u64 ttbr1; u64 tcr; u64 mair; u64 tcr2; u64 pir; u64 pire0; u64 por_el0; u64 por_el1; u64 sctlr; u64 vttbr; u64 vtcr; u64 hcr; }; static void __mmu_config_save(struct mmu_config *config) { config->ttbr0 = read_sysreg_el1(SYS_TTBR0); config->ttbr1 = read_sysreg_el1(SYS_TTBR1); config->tcr = read_sysreg_el1(SYS_TCR); config->mair = read_sysreg_el1(SYS_MAIR); if (cpus_have_final_cap(ARM64_HAS_TCR2)) { config->tcr2 = read_sysreg_el1(SYS_TCR2); if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { config->pir = read_sysreg_el1(SYS_PIR); config->pire0 = read_sysreg_el1(SYS_PIRE0); } if (system_supports_poe()) { config->por_el1 = read_sysreg_el1(SYS_POR); config->por_el0 = read_sysreg_s(SYS_POR_EL0); } } config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); config->hcr = read_sysreg(hcr_el2); } static void __mmu_config_restore(struct mmu_config *config) { write_sysreg(config->hcr, hcr_el2); /* * ARM errata 1165522 and 1530923 require TGE to be 1 before * we update the guest state. */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); write_sysreg_el1(config->ttbr0, SYS_TTBR0); write_sysreg_el1(config->ttbr1, SYS_TTBR1); write_sysreg_el1(config->tcr, SYS_TCR); write_sysreg_el1(config->mair, SYS_MAIR); if (cpus_have_final_cap(ARM64_HAS_TCR2)) { write_sysreg_el1(config->tcr2, SYS_TCR2); if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { write_sysreg_el1(config->pir, SYS_PIR); write_sysreg_el1(config->pire0, SYS_PIRE0); } if (system_supports_poe()) { write_sysreg_el1(config->por_el1, SYS_POR); write_sysreg_s(config->por_el0, SYS_POR_EL0); } } write_sysreg_el1(config->sctlr, SYS_SCTLR); write_sysreg(config->vttbr, vttbr_el2); write_sysreg(config->vtcr, vtcr_el2); } static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 host_pan; bool fail; host_pan = read_sysreg_s(SYS_PSTATE_PAN); write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); switch (op) { case OP_AT_S1E1RP: fail = __kvm_at(OP_AT_S1E1RP, vaddr); break; case OP_AT_S1E1WP: fail = __kvm_at(OP_AT_S1E1WP, vaddr); break; } write_sysreg_s(host_pan, SYS_PSTATE_PAN); return fail; } #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) #define MEMATTR_NC 0b0100 #define MEMATTR_Wt 0b1000 #define MEMATTR_Wb 0b1100 #define MEMATTR_WbRaWa 0b1111 #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) static u8 s2_memattr_to_attr(u8 memattr) { memattr &= 0b1111; switch (memattr) { case 0b0000: case 0b0001: case 0b0010: case 0b0011: return memattr << 2; case 0b0100: return MEMATTR(Wb, Wb); case 0b0101: return MEMATTR(NC, NC); case 0b0110: return MEMATTR(Wt, NC); case 0b0111: return MEMATTR(Wb, NC); case 0b1000: /* Reserved, assume NC */ return MEMATTR(NC, NC); case 0b1001: return MEMATTR(NC, Wt); case 0b1010: return MEMATTR(Wt, Wt); case 0b1011: return MEMATTR(Wb, Wt); case 0b1100: /* Reserved, assume NC */ return MEMATTR(NC, NC); case 0b1101: return MEMATTR(NC, Wb); case 0b1110: return MEMATTR(Wt, Wb); case 0b1111: return MEMATTR(Wb, Wb); default: unreachable(); } } static u8 combine_s1_s2_attr(u8 s1, u8 s2) { bool transient; u8 final = 0; /* Upgrade transient s1 to non-transient to simplify things */ switch (s1) { case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ transient = true; s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); break; case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ transient = true; s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); break; default: transient = false; } /* S2CombineS1AttrHints() */ if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || (s2 & GENMASK(3, 2)) == MEMATTR_NC) final = MEMATTR_NC; else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || (s2 & GENMASK(3, 2)) == MEMATTR_Wt) final = MEMATTR_Wt; else final = MEMATTR_Wb; if (final != MEMATTR_NC) { /* Inherit RaWa hints form S1 */ if (transient) { switch (s1 & GENMASK(3, 2)) { case MEMATTR_Wt: final = 0; break; case MEMATTR_Wb: final = MEMATTR_NC; break; } } final |= s1 & GENMASK(1, 0); } return final; } #define ATTR_NSH 0b00 #define ATTR_RSV 0b01 #define ATTR_OSH 0b10 #define ATTR_ISH 0b11 static u8 compute_sh(u8 attr, u64 desc) { u8 sh; /* Any form of device, as well as NC has SH[1:0]=0b10 */ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) return ATTR_OSH; sh = FIELD_GET(PTE_SHARED, desc); if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ sh = ATTR_NSH; return sh; } static u8 combine_sh(u8 s1_sh, u8 s2_sh) { if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) return ATTR_OSH; if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) return ATTR_ISH; return ATTR_NSH; } static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, struct kvm_s2_trans *tr) { u8 s1_parattr, s2_memattr, final_attr; u64 par; /* If S2 has failed to translate, report the damage */ if (tr->esr) { par = SYS_PAR_EL1_RES1; par |= SYS_PAR_EL1_F; par |= SYS_PAR_EL1_S; par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); return par; } s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) s2_memattr &= ~BIT(3); /* Combination of R_VRJSW and R_RHWZM */ switch (s2_memattr) { case 0b0101: if (MEMATTR_IS_DEVICE(s1_parattr)) final_attr = s1_parattr; else final_attr = MEMATTR(NC, NC); break; case 0b0110: case 0b1110: final_attr = MEMATTR(WbRaWa, WbRaWa); break; case 0b0111: case 0b1111: /* Preserve S1 attribute */ final_attr = s1_parattr; break; case 0b0100: case 0b1100: case 0b1101: /* Reserved, do something non-silly */ final_attr = s1_parattr; break; default: /* * MemAttr[2]=0, Device from S2. * * FWB does not influence the way that stage 1 * memory types and attributes are combined * with stage 2 Device type and attributes. */ final_attr = min(s2_memattr_to_attr(s2_memattr), s1_parattr); } } else { /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ u8 s2_parattr = s2_memattr_to_attr(s2_memattr); if (MEMATTR_IS_DEVICE(s1_parattr) || MEMATTR_IS_DEVICE(s2_parattr)) { final_attr = min(s1_parattr, s2_parattr); } else { /* At this stage, this is memory vs memory */ final_attr = combine_s1_s2_attr(s1_parattr & 0xf, s2_parattr & 0xf); final_attr |= combine_s1_s2_attr(s1_parattr >> 4, s2_parattr >> 4) << 4; } } if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && !MEMATTR_IS_DEVICE(final_attr)) final_attr = MEMATTR(NC, NC); par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); par |= tr->output & GENMASK(47, 12); par |= FIELD_PREP(SYS_PAR_EL1_SH, combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), compute_sh(final_attr, tr->desc))); return par; } static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, enum trans_regime regime) { u64 par; if (wr->failed) { par = SYS_PAR_EL1_RES1; par |= SYS_PAR_EL1_F; par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; par |= wr->s2 ? SYS_PAR_EL1_S : 0; } else if (wr->level == S1_MMU_DISABLED) { /* MMU off or HCR_EL2.DC == 1 */ par = SYS_PAR_EL1_NSE; par |= wr->pa & GENMASK_ULL(47, 12); if (regime == TR_EL10 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { par |= FIELD_PREP(SYS_PAR_EL1_ATTR, MEMATTR(WbRaWa, WbRaWa)); par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); } else { par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); } } else { u64 mair, sctlr; u8 sh; par = SYS_PAR_EL1_NSE; mair = (regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, MAIR_EL1) : vcpu_read_sys_reg(vcpu, MAIR_EL2)); mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; mair &= 0xff; sctlr = (regime == TR_EL10 ? vcpu_read_sys_reg(vcpu, SCTLR_EL1) : vcpu_read_sys_reg(vcpu, SCTLR_EL2)); /* Force NC for memory if SCTLR_ELx.C is clear */ if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) mair = MEMATTR(NC, NC); par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); par |= wr->pa & GENMASK_ULL(47, 12); sh = compute_sh(mair, wr->desc); par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); } return par; } static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) { u64 sctlr; if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) return false; if (s1pie_enabled(vcpu, regime)) return true; if (regime == TR_EL10) sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); else sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); return sctlr & SCTLR_EL1_EPAN; } static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { bool wxn; /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ if (wi->regime != TR_EL2) { switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { case 0b00: wr->pr = wr->pw = true; wr->ur = wr->uw = false; break; case 0b01: wr->pr = wr->pw = wr->ur = wr->uw = true; break; case 0b10: wr->pr = true; wr->pw = wr->ur = wr->uw = false; break; case 0b11: wr->pr = wr->ur = true; wr->pw = wr->uw = false; break; } /* We don't use px for anything yet, but hey... */ wr->px = !((wr->desc & PTE_PXN) || wr->uw); wr->ux = !(wr->desc & PTE_UXN); } else { wr->ur = wr->uw = wr->ux = false; if (!(wr->desc & PTE_RDONLY)) { wr->pr = wr->pw = true; } else { wr->pr = true; wr->pw = false; } /* XN maps to UXN */ wr->px = !(wr->desc & PTE_UXN); } switch (wi->regime) { case TR_EL2: case TR_EL20: wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); break; case TR_EL10: wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); break; } wr->pwxn = wr->uwxn = wxn; wr->pov = wi->poe; wr->uov = wi->e0poe; } static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { /* Hierarchical part of AArch64.S1DirectBasePermissions() */ if (wi->regime != TR_EL2) { switch (wr->APTable) { case 0b00: break; case 0b01: wr->ur = wr->uw = false; break; case 0b10: wr->pw = wr->uw = false; break; case 0b11: wr->pw = wr->ur = wr->uw = false; break; } wr->px &= !wr->PXNTable; wr->ux &= !wr->UXNTable; } else { if (wr->APTable & BIT(1)) wr->pw = false; /* XN maps to UXN */ wr->px &= !wr->UXNTable; } } #define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) #define set_priv_perms(wr, r, w, x) \ do { \ (wr)->pr = (r); \ (wr)->pw = (w); \ (wr)->px = (x); \ } while (0) #define set_unpriv_perms(wr, r, w, x) \ do { \ (wr)->ur = (r); \ (wr)->uw = (w); \ (wr)->ux = (x); \ } while (0) #define set_priv_wxn(wr, v) \ do { \ (wr)->pwxn = (v); \ } while (0) #define set_unpriv_wxn(wr, v) \ do { \ (wr)->uwxn = (v); \ } while (0) /* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ #define set_perms(w, wr, ip) \ do { \ /* R_LLZDZ */ \ switch ((ip)) { \ case 0b0000: \ set_ ## w ## _perms((wr), false, false, false); \ break; \ case 0b0001: \ set_ ## w ## _perms((wr), true , false, false); \ break; \ case 0b0010: \ set_ ## w ## _perms((wr), false, false, true ); \ break; \ case 0b0011: \ set_ ## w ## _perms((wr), true , false, true ); \ break; \ case 0b0100: \ set_ ## w ## _perms((wr), false, false, false); \ break; \ case 0b0101: \ set_ ## w ## _perms((wr), true , true , false); \ break; \ case 0b0110: \ set_ ## w ## _perms((wr), true , true , true ); \ break; \ case 0b0111: \ set_ ## w ## _perms((wr), true , true , true ); \ break; \ case 0b1000: \ set_ ## w ## _perms((wr), true , false, false); \ break; \ case 0b1001: \ set_ ## w ## _perms((wr), true , false, false); \ break; \ case 0b1010: \ set_ ## w ## _perms((wr), true , false, true ); \ break; \ case 0b1011: \ set_ ## w ## _perms((wr), false, false, false); \ break; \ case 0b1100: \ set_ ## w ## _perms((wr), true , true , false); \ break; \ case 0b1101: \ set_ ## w ## _perms((wr), false, false, false); \ break; \ case 0b1110: \ set_ ## w ## _perms((wr), true , true , true ); \ break; \ case 0b1111: \ set_ ## w ## _perms((wr), false, false, false); \ break; \ } \ \ /* R_HJYGR */ \ set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ \ } while (0) static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { u8 up, pp, idx; idx = pte_pi_index(wr->desc); switch (wi->regime) { case TR_EL10: pp = perm_idx(vcpu, PIR_EL1, idx); up = perm_idx(vcpu, PIRE0_EL1, idx); break; case TR_EL20: pp = perm_idx(vcpu, PIR_EL2, idx); up = perm_idx(vcpu, PIRE0_EL2, idx); break; case TR_EL2: pp = perm_idx(vcpu, PIR_EL2, idx); up = 0; break; } set_perms(priv, wr, pp); if (wi->regime != TR_EL2) set_perms(unpriv, wr, up); else set_unpriv_perms(wr, false, false, false); wr->pov = wi->poe && !(pp & BIT(3)); wr->uov = wi->e0poe && !(up & BIT(3)); /* R_VFPJF */ if (wr->px && wr->uw) { set_priv_perms(wr, false, false, false); set_unpriv_perms(wr, false, false, false); } } static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { u8 idx, pov_perms, uov_perms; idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); switch (wi->regime) { case TR_EL10: pov_perms = perm_idx(vcpu, POR_EL1, idx); uov_perms = perm_idx(vcpu, POR_EL0, idx); break; case TR_EL20: pov_perms = perm_idx(vcpu, POR_EL2, idx); uov_perms = perm_idx(vcpu, POR_EL0, idx); break; case TR_EL2: pov_perms = perm_idx(vcpu, POR_EL2, idx); uov_perms = 0; break; } if (pov_perms & ~POE_RXW) pov_perms = POE_NONE; if (wi->poe && wr->pov) { wr->pr &= pov_perms & POE_R; wr->px &= pov_perms & POE_X; wr->pw &= pov_perms & POE_W; } if (uov_perms & ~POE_RXW) uov_perms = POE_NONE; if (wi->e0poe && wr->uov) { wr->ur &= uov_perms & POE_R; wr->ux &= uov_perms & POE_X; wr->uw &= uov_perms & POE_W; } } static void compute_s1_permissions(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr) { bool pan; if (!s1pie_enabled(vcpu, wi->regime)) compute_s1_direct_permissions(vcpu, wi, wr); else compute_s1_indirect_permissions(vcpu, wi, wr); if (!wi->hpd) compute_s1_hierarchical_permissions(vcpu, wi, wr); if (wi->poe || wi->e0poe) compute_s1_overlay_permissions(vcpu, wi, wr); /* R_QXXPC */ if (wr->pwxn) { if (!wr->pov && wr->pw) wr->px = false; if (wr->pov && wr->px) wr->pw = false; } /* R_NPBXC */ if (wr->uwxn) { if (!wr->uov && wr->uw) wr->ux = false; if (wr->uov && wr->ux) wr->uw = false; } pan = wi->pan && (wr->ur || wr->uw || (pan3_enabled(vcpu, wi->regime) && wr->ux)); wr->pw &= !pan; wr->pr &= !pan; } static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; bool perm_fail = false; int ret, idx; ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); if (ret) goto compute_par; if (wr.level == S1_MMU_DISABLED) goto compute_par; idx = srcu_read_lock(&vcpu->kvm->srcu); ret = walk_s1(vcpu, &wi, &wr, vaddr); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (ret) goto compute_par; compute_s1_permissions(vcpu, &wi, &wr); switch (op) { case OP_AT_S1E1RP: case OP_AT_S1E1R: case OP_AT_S1E2R: perm_fail = !wr.pr; break; case OP_AT_S1E1WP: case OP_AT_S1E1W: case OP_AT_S1E2W: perm_fail = !wr.pw; break; case OP_AT_S1E0R: perm_fail = !wr.ur; break; case OP_AT_S1E0W: perm_fail = !wr.uw; break; case OP_AT_S1E1A: case OP_AT_S1E2A: break; default: BUG(); } if (perm_fail) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); compute_par: return compute_par_s1(vcpu, &wr, wi.regime); } /* * Return the PAR_EL1 value as the result of a valid translation. * * If the translation is unsuccessful, the value may only contain * PAR_EL1.F, and cannot be taken at face value. It isn't an * indication of the translation having failed, only that the fast * path did not succeed, *unless* it indicates a S1 permission fault. */ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct mmu_config config; struct kvm_s2_mmu *mmu; bool fail; u64 par; par = SYS_PAR_EL1_F; /* * We've trapped, so everything is live on the CPU. As we will * be switching contexts behind everybody's back, disable * interrupts while holding the mmu lock. */ guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); /* * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already * the right one (as we trapped from vEL2). If not, save the * full MMU context. */ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) goto skip_mmu_switch; /* * Obtaining the S2 MMU for a L2 is horribly racy, and we may not * find it (recycled by another vcpu, for example). When this * happens, admit defeat immediately and use the SW (slow) path. */ mmu = lookup_s2_mmu(vcpu); if (!mmu) return par; __mmu_config_save(&config); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); if (kvm_has_tcr2(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); if (kvm_has_s1pie(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); } if (kvm_has_s1poe(vcpu->kvm)) { write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); } } write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); __load_stage2(mmu, mmu->arch); skip_mmu_switch: /* Clear TGE, enable S2 translation, we're rolling */ write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); isb(); switch (op) { case OP_AT_S1E1RP: case OP_AT_S1E1WP: fail = at_s1e1p_fast(vcpu, op, vaddr); break; case OP_AT_S1E1R: fail = __kvm_at(OP_AT_S1E1R, vaddr); break; case OP_AT_S1E1W: fail = __kvm_at(OP_AT_S1E1W, vaddr); break; case OP_AT_S1E0R: fail = __kvm_at(OP_AT_S1E0R, vaddr); break; case OP_AT_S1E0W: fail = __kvm_at(OP_AT_S1E0W, vaddr); break; case OP_AT_S1E1A: fail = __kvm_at(OP_AT_S1E1A, vaddr); break; default: WARN_ON_ONCE(1); fail = true; break; } if (!fail) par = read_sysreg_par(); if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) __mmu_config_restore(&config); return par; } static bool par_check_s1_perm_fault(u64 par) { u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && !(par & SYS_PAR_EL1_S)); } void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); /* * If PAR_EL1 reports that AT failed on a S1 permission fault, we * know for sure that the PTW was able to walk the S1 tables and * there's nothing else to do. * * If AT failed for any other reason, then we must walk the guest S1 * to emulate the instruction. */ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) par = handle_at_slow(vcpu, op, vaddr); vcpu_write_sys_reg(vcpu, par, PAR_EL1); } void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par; /* * We've trapped, so everything is live on the CPU. As we will be * switching context behind everybody's back, disable interrupts... */ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { u64 val, hcr; bool fail; val = hcr = read_sysreg(hcr_el2); val &= ~HCR_TGE; val |= HCR_VM; if (!vcpu_el2_e2h_is_set(vcpu)) val |= HCR_NV | HCR_NV1; write_sysreg(val, hcr_el2); isb(); par = SYS_PAR_EL1_F; switch (op) { case OP_AT_S1E2R: fail = __kvm_at(OP_AT_S1E1R, vaddr); break; case OP_AT_S1E2W: fail = __kvm_at(OP_AT_S1E1W, vaddr); break; case OP_AT_S1E2A: fail = __kvm_at(OP_AT_S1E1A, vaddr); break; default: WARN_ON_ONCE(1); fail = true; } isb(); if (!fail) par = read_sysreg_par(); write_sysreg(hcr, hcr_el2); isb(); } /* We failed the translation, let's replay it in slow motion */ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) par = handle_at_slow(vcpu, op, vaddr); vcpu_write_sys_reg(vcpu, par, PAR_EL1); } void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct kvm_s2_trans out = {}; u64 ipa, par; bool write; int ret; /* Do the stage-1 translation */ switch (op) { case OP_AT_S12E1R: op = OP_AT_S1E1R; write = false; break; case OP_AT_S12E1W: op = OP_AT_S1E1W; write = true; break; case OP_AT_S12E0R: op = OP_AT_S1E0R; write = false; break; case OP_AT_S12E0W: op = OP_AT_S1E0W; write = true; break; default: WARN_ON_ONCE(1); return; } __kvm_at_s1e01(vcpu, op, vaddr); par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) return; /* * If we only have a single stage of translation (E2H=0 or * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. */ if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) return; /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) return; /* Check the access permission */ if (!out.esr && ((!write && !out.readable) || (write && !out.writable))) out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); }