Andrew Geissler | af5e4ef | 2020-10-16 10:22:50 -0500 | [diff] [blame] | 1 | From b3fcc7d96523ad8e3ea28c09d495ef08529d01ce Mon Sep 17 00:00:00 2001 |
| 2 | From: Victor Kamensky <kamensky@cisco.com> |
| 3 | Date: Wed, 7 Oct 2020 10:19:42 -0700 |
| 4 | Subject: [PATCH] mips: add 34Kf-64tlb fictitious cpu type like 34Kf but with |
| 5 | 64 TLBs |
| 6 | |
| 7 | In Yocto Project CI runs it was observed that a test run |
| 8 | of a 32-bit mips image takes almost twice as long as a 64-bit |
| 9 | mips image with the same logical load, and the CI execution |
| 10 | hits a timeout. |
| 11 | |
| 12 | See https://bugzilla.yoctoproject.org/show_bug.cgi?id=13992 |
| 13 | |
| 14 | Yocto project uses 34Kf cpu type to run 32 bit mips image, |
| 15 | and MIPS64R2-generic cpu type to run 64 bit mips64 image. |
| 16 | |
| 17 | Investigating the differences in qemu behavior between mips |
| 18 | and mips64 brought up two prominent observations: under a |
| 19 | logically similar load (same definition and configuration |
| 20 | of the user-land image), in the mips case the get_physical_address |
| 21 | function is called almost twice as often, meaning that |
| 22 | twice as many memory accesses are involved in this case. Also, the |
| 23 | number of tlbwr instructions executed (r4k_helper_tlbwr |
| 24 | qemu function) is almost 16 times bigger in the mips case than in |
| 25 | mips64. |
| 26 | |
| 27 | It turns out that the 34Kf cpu has 16 TLBs, but in the case of |
| 28 | MIPS64R2-generic it is 64 TLBs. That explains why |
| 29 | so many more tlbwr had to be executed by the kernel TLB refill |
| 30 | handler in the case of 32-bit mips. |
| 31 | |
| 32 | The idea of the fix is to come up with a new 34Kf-64tlb fictitious |
| 33 | cpu type that would behave exactly like 34Kf but would |
| 34 | contain 64 TLBs to reduce TLB thrashing. After all, adding |
| 35 | more TLBs to the soft mmu is easy. |
| 36 | |
| 37 | An experiment with a significant, non-trivial load in the Yocto |
| 38 | environment, running the do_testimage load, shows that the 34Kf-64tlb |
| 39 | cpu performs 40% or so better than the original 34Kf cpu wrt test |
| 40 | execution real time. |
| 41 | |
| 42 | It is not ideal to have a cpu type that does not exist in the |
| 43 | wild, but given the performance gains it seems to be justified. |
| 44 | |
| 45 | Signed-off-by: Victor Kamensky <kamensky@cisco.com> |
| 46 | --- |
| 47 | target/mips/translate_init.inc.c | 55 ++++++++++++++++++++++++++++++++++++++++ |
| 48 | 1 file changed, 55 insertions(+) |
| 49 | |
| 50 | diff --git a/target/mips/translate_init.inc.c b/target/mips/translate_init.inc.c |
| 51 | index 637caccd89..b73ab48231 100644 |
| 52 | --- a/target/mips/translate_init.inc.c |
| 53 | +++ b/target/mips/translate_init.inc.c |
| 54 | @@ -297,6 +297,61 @@ const mips_def_t mips_defs[] = |
| 55 | .insn_flags = CPU_MIPS32R2 | ASE_MIPS16 | ASE_DSP | ASE_MT, |
| 56 | .mmu_type = MMU_TYPE_R4000, |
| 57 | }, |
| 58 | + /* |
| 59 | + * Verbatim copy of "34Kf" cpu, only bumped up number of TLB entries |
| 60 | + * from 16 to 64 (see CP0_Config1 value at CP0C1_MMU bits) to improve |
| 61 | + * performance by reducing number of TLB refill exceptions and |
| 62 | + * eliminating need to run all corresponding TLB refill handling |
| 63 | + * instructions. |
| 64 | + */ |
| 65 | + { |
| 66 | + .name = "34Kf-64tlb", |
| 67 | + .CP0_PRid = 0x00019500, |
| 68 | + .CP0_Config0 = MIPS_CONFIG0 | (0x1 << CP0C0_AR) | |
| 69 | + (MMU_TYPE_R4000 << CP0C0_MT), |
| 70 | + .CP0_Config1 = MIPS_CONFIG1 | (1 << CP0C1_FP) | (63 << CP0C1_MMU) | |
| 71 | + (0 << CP0C1_IS) | (3 << CP0C1_IL) | (1 << CP0C1_IA) | |
| 72 | + (0 << CP0C1_DS) | (3 << CP0C1_DL) | (1 << CP0C1_DA) | |
| 73 | + (1 << CP0C1_CA), |
| 74 | + .CP0_Config2 = MIPS_CONFIG2, |
| 75 | + .CP0_Config3 = MIPS_CONFIG3 | (1 << CP0C3_VInt) | (1 << CP0C3_MT) | |
| 76 | + (1 << CP0C3_DSPP), |
| 77 | + .CP0_LLAddr_rw_bitmask = 0, |
| 78 | + .CP0_LLAddr_shift = 0, |
| 79 | + .SYNCI_Step = 32, |
| 80 | + .CCRes = 2, |
| 81 | + .CP0_Status_rw_bitmask = 0x3778FF1F, |
| 82 | + .CP0_TCStatus_rw_bitmask = (0 << CP0TCSt_TCU3) | (0 << CP0TCSt_TCU2) | |
| 83 | + (1 << CP0TCSt_TCU1) | (1 << CP0TCSt_TCU0) | |
| 84 | + (0 << CP0TCSt_TMX) | (1 << CP0TCSt_DT) | |
| 85 | + (1 << CP0TCSt_DA) | (1 << CP0TCSt_A) | |
| 86 | + (0x3 << CP0TCSt_TKSU) | (1 << CP0TCSt_IXMT) | |
| 87 | + (0xff << CP0TCSt_TASID), |
| 88 | + .CP1_fcr0 = (1 << FCR0_F64) | (1 << FCR0_L) | (1 << FCR0_W) | |
| 89 | + (1 << FCR0_D) | (1 << FCR0_S) | (0x95 << FCR0_PRID), |
| 90 | + .CP1_fcr31 = 0, |
| 91 | + .CP1_fcr31_rw_bitmask = 0xFF83FFFF, |
| 92 | + .CP0_SRSCtl = (0xf << CP0SRSCtl_HSS), |
| 93 | + .CP0_SRSConf0_rw_bitmask = 0x3fffffff, |
| 94 | + .CP0_SRSConf0 = (1U << CP0SRSC0_M) | (0x3fe << CP0SRSC0_SRS3) | |
| 95 | + (0x3fe << CP0SRSC0_SRS2) | (0x3fe << CP0SRSC0_SRS1), |
| 96 | + .CP0_SRSConf1_rw_bitmask = 0x3fffffff, |
| 97 | + .CP0_SRSConf1 = (1U << CP0SRSC1_M) | (0x3fe << CP0SRSC1_SRS6) | |
| 98 | + (0x3fe << CP0SRSC1_SRS5) | (0x3fe << CP0SRSC1_SRS4), |
| 99 | + .CP0_SRSConf2_rw_bitmask = 0x3fffffff, |
| 100 | + .CP0_SRSConf2 = (1U << CP0SRSC2_M) | (0x3fe << CP0SRSC2_SRS9) | |
| 101 | + (0x3fe << CP0SRSC2_SRS8) | (0x3fe << CP0SRSC2_SRS7), |
| 102 | + .CP0_SRSConf3_rw_bitmask = 0x3fffffff, |
| 103 | + .CP0_SRSConf3 = (1U << CP0SRSC3_M) | (0x3fe << CP0SRSC3_SRS12) | |
| 104 | + (0x3fe << CP0SRSC3_SRS11) | (0x3fe << CP0SRSC3_SRS10), |
| 105 | + .CP0_SRSConf4_rw_bitmask = 0x3fffffff, |
| 106 | + .CP0_SRSConf4 = (0x3fe << CP0SRSC4_SRS15) | |
| 107 | + (0x3fe << CP0SRSC4_SRS14) | (0x3fe << CP0SRSC4_SRS13), |
| 108 | + .SEGBITS = 32, |
| 109 | + .PABITS = 32, |
| 110 | + .insn_flags = CPU_MIPS32R2 | ASE_MIPS16 | ASE_DSP | ASE_MT, |
| 111 | + .mmu_type = MMU_TYPE_R4000, |
| 112 | + }, |
| 113 | { |
| 114 | .name = "74Kf", |
| 115 | .CP0_PRid = 0x00019700, |
| 116 | -- |
| 117 | 2.14.5 |
| 118 | |