| From b3fcc7d96523ad8e3ea28c09d495ef08529d01ce Mon Sep 17 00:00:00 2001 |
| From: Victor Kamensky <kamensky@cisco.com> |
| Date: Wed, 7 Oct 2020 10:19:42 -0700 |
| Subject: [PATCH] mips: add 34Kf-64tlb fictitious cpu type like 34Kf but with |
| 64 TLBs |
| |
| In Yocto Project CI runs it was observed that test run |
| of 32 bit mips image takes almost twice longer than 64 bit |
| mips image with the same logical load and CI execution |
| hits timeout. |
| |
| See https://bugzilla.yoctoproject.org/show_bug.cgi?id=13992 |
| |
| Yocto project uses 34Kf cpu type to run 32 bit mips image, |
| and MIPS64R2-generic cpu type to run 64 bit mips64 image. |
| |
| Upon qemu behavior differences investigation between mips |
| and mips64 two prominent observations came up: under |
| logically similar load (same definition and configuration |
| of user-land image) in case of mips get_physical_address |
| function is called almost twice more often, meaning |
| twice more memory accesses involved in this case. Also |
| number of tlbwr instruction executed (r4k_helper_tlbwr |
| qemu function) almost 16 time bigger in mips case than in |
| mips64. |
| |
| It turns out that 34Kf cpu has 16 TLBs, but in case of |
| MIPS64R2-generic it is 64 TLBs. So that explains why |
| some many more tlbwr had to be execute by kernel TLB refill |
| handler in case of 32 bit misp. |
| |
| The idea of the fix is to come up with new 34Kf-64tlb fictitious |
| cpu type, that would behave exactly as 34Kf but it would |
| contain 64 TLBs to reduce TLB trashing. After all, adding |
| more TLBs to soft mmu is easy. |
| |
| Experiment with some significant non-trvial load in Yocto |
| environment by running do_testimage load shows that 34Kf-64tlb |
| cpu performs 40% or so better than original 34Kf cpu wrt test |
| execution real time. |
| |
| It is not ideal to have cpu type that does not exist in the |
| wild but given performance gains it seems to be justified. |
| |
| Signed-off-by: Victor Kamensky <kamensky@cisco.com> |
| --- |
| target/mips/translate_init.inc.c | 55 ++++++++++++++++++++++++++++++++++++++++ |
| 1 file changed, 55 insertions(+) |
| |
| diff --git a/target/mips/translate_init.inc.c b/target/mips/translate_init.inc.c |
| index 637caccd89..b73ab48231 100644 |
| --- a/target/mips/translate_init.inc.c |
| +++ b/target/mips/translate_init.inc.c |
| @@ -297,6 +297,61 @@ const mips_def_t mips_defs[] = |
| .insn_flags = CPU_MIPS32R2 | ASE_MIPS16 | ASE_DSP | ASE_MT, |
| .mmu_type = MMU_TYPE_R4000, |
| }, |
| + /* |
| + * Verbatim copy of "34Kf" cpu, only bumped up number of TLB entries |
| + * from 16 to 64 (see CP0_Config0 value at CP0C1_MMU bits) to improve |
| + * performance by reducing number of TLB refill exceptions and |
| + * eliminating need to run all corresponding TLB refill handling |
| + * instructions. |
| + */ |
| + { |
| + .name = "34Kf-64tlb", |
| + .CP0_PRid = 0x00019500, |
| + .CP0_Config0 = MIPS_CONFIG0 | (0x1 << CP0C0_AR) | |
| + (MMU_TYPE_R4000 << CP0C0_MT), |
| + .CP0_Config1 = MIPS_CONFIG1 | (1 << CP0C1_FP) | (63 << CP0C1_MMU) | |
| + (0 << CP0C1_IS) | (3 << CP0C1_IL) | (1 << CP0C1_IA) | |
| + (0 << CP0C1_DS) | (3 << CP0C1_DL) | (1 << CP0C1_DA) | |
| + (1 << CP0C1_CA), |
| + .CP0_Config2 = MIPS_CONFIG2, |
| + .CP0_Config3 = MIPS_CONFIG3 | (1 << CP0C3_VInt) | (1 << CP0C3_MT) | |
| + (1 << CP0C3_DSPP), |
| + .CP0_LLAddr_rw_bitmask = 0, |
| + .CP0_LLAddr_shift = 0, |
| + .SYNCI_Step = 32, |
| + .CCRes = 2, |
| + .CP0_Status_rw_bitmask = 0x3778FF1F, |
| + .CP0_TCStatus_rw_bitmask = (0 << CP0TCSt_TCU3) | (0 << CP0TCSt_TCU2) | |
| + (1 << CP0TCSt_TCU1) | (1 << CP0TCSt_TCU0) | |
| + (0 << CP0TCSt_TMX) | (1 << CP0TCSt_DT) | |
| + (1 << CP0TCSt_DA) | (1 << CP0TCSt_A) | |
| + (0x3 << CP0TCSt_TKSU) | (1 << CP0TCSt_IXMT) | |
| + (0xff << CP0TCSt_TASID), |
| + .CP1_fcr0 = (1 << FCR0_F64) | (1 << FCR0_L) | (1 << FCR0_W) | |
| + (1 << FCR0_D) | (1 << FCR0_S) | (0x95 << FCR0_PRID), |
| + .CP1_fcr31 = 0, |
| + .CP1_fcr31_rw_bitmask = 0xFF83FFFF, |
| + .CP0_SRSCtl = (0xf << CP0SRSCtl_HSS), |
| + .CP0_SRSConf0_rw_bitmask = 0x3fffffff, |
| + .CP0_SRSConf0 = (1U << CP0SRSC0_M) | (0x3fe << CP0SRSC0_SRS3) | |
| + (0x3fe << CP0SRSC0_SRS2) | (0x3fe << CP0SRSC0_SRS1), |
| + .CP0_SRSConf1_rw_bitmask = 0x3fffffff, |
| + .CP0_SRSConf1 = (1U << CP0SRSC1_M) | (0x3fe << CP0SRSC1_SRS6) | |
| + (0x3fe << CP0SRSC1_SRS5) | (0x3fe << CP0SRSC1_SRS4), |
| + .CP0_SRSConf2_rw_bitmask = 0x3fffffff, |
| + .CP0_SRSConf2 = (1U << CP0SRSC2_M) | (0x3fe << CP0SRSC2_SRS9) | |
| + (0x3fe << CP0SRSC2_SRS8) | (0x3fe << CP0SRSC2_SRS7), |
| + .CP0_SRSConf3_rw_bitmask = 0x3fffffff, |
| + .CP0_SRSConf3 = (1U << CP0SRSC3_M) | (0x3fe << CP0SRSC3_SRS12) | |
| + (0x3fe << CP0SRSC3_SRS11) | (0x3fe << CP0SRSC3_SRS10), |
| + .CP0_SRSConf4_rw_bitmask = 0x3fffffff, |
| + .CP0_SRSConf4 = (0x3fe << CP0SRSC4_SRS15) | |
| + (0x3fe << CP0SRSC4_SRS14) | (0x3fe << CP0SRSC4_SRS13), |
| + .SEGBITS = 32, |
| + .PABITS = 32, |
| + .insn_flags = CPU_MIPS32R2 | ASE_MIPS16 | ASE_DSP | ASE_MT, |
| + .mmu_type = MMU_TYPE_R4000, |
| + }, |
| { |
| .name = "74Kf", |
| .CP0_PRid = 0x00019700, |
| -- |
| 2.14.5 |
| |