1169689Skan;; ARM 1026EJ-S Pipeline Description 2169689Skan;; Copyright (C) 2003 Free Software Foundation, Inc. 3169689Skan;; Written by CodeSourcery, LLC. 4169689Skan;; 5169689Skan;; This file is part of GCC. 6169689Skan;; 7169689Skan;; GCC is free software; you can redistribute it and/or modify it 8169689Skan;; under the terms of the GNU General Public License as published by 9169689Skan;; the Free Software Foundation; either version 2, or (at your option) 10169689Skan;; any later version. 11169689Skan;; 12169689Skan;; GCC is distributed in the hope that it will be useful, but 13169689Skan;; WITHOUT ANY WARRANTY; without even the implied warranty of 14169689Skan;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15169689Skan;; General Public License for more details. 16169689Skan;; 17169689Skan;; You should have received a copy of the GNU General Public License 18169689Skan;; along with GCC; see the file COPYING. If not, write to the Free 19169689Skan;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 20169689Skan;; 02110-1301, USA. */ 21169689Skan 22169689Skan;; These descriptions are based on the information contained in the 23169689Skan;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM 24169689Skan;; Limited. 25169689Skan;; 26169689Skan 27169689Skan;; This automaton provides a pipeline description for the ARM 28169689Skan;; 1026EJ-S core. 29169689Skan;; 30169689Skan;; The model given here assumes that the condition for all conditional 31169689Skan;; instructions is "true", i.e., that all of the instructions are 32169689Skan;; actually executed. 
;; Scheduling automaton for the ARM 1026EJ-S; every unit and
;; reservation in this description is attached to it.
(define_automaton "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Two pipelines are described:
;;
;;   - The Arithmetic Logic Unit (ALU) pipeline.  It has fetch, issue,
;;     decode, execute, memory, and write stages, of which only the
;;     execute, memory and write stages need to be modeled.
;;
;;   - The Load-Store Unit (LSU) pipeline.  It has decode, execute,
;;     memory, and write stages, of which only the execute, memory and
;;     write stages are modeled.

;; ALU pipeline: execute, memory and write stages.
(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")

;; LSU pipeline: execute, memory and write stages.
(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require three cycles to execute, and use the ALU
;; pipeline in each of the three stages.  The results are available
;; after the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.
;; Plain ALU operation (no shifted operand): one cycle in each ALU
;; stage, result latency of 1.
(define_insn_reservation "alu_op" 1
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu"))
  "a_e,a_m,a_w")

;; ALU operation whose operand is shifted by a constant: same timing
;; as the plain ALU operation.
(define_insn_reservation "alu_shift_op" 1
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu_shift"))
  "a_e,a_m,a_w")

;; ALU operation whose operand is shifted by a register.  The hardware
;; really stalls in the decoder so that the shift value can be read in
;; a second cycle; since the decoder is not modeled, pretend instead
;; that the instruction spends two cycles in the execute stage.
(define_insn_reservation "alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu_shift_reg"))
  "a_e*2,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.

;; "smul" and "smulw": a single pass through the execute stage, but
;; the result is not available until after the memory stage.
(define_insn_reservation "mult1" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "insn" "smulxy,smulwy"))
  "a_e,a_m,a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;; "smlaxy" and "smlawx": two iterations through the execute stage,
;; result available immediately after it (latency 2).
;; NOTE: "smlalxy" is intentionally not listed here.  Its result is not
;; available until after the memory stage, so it belongs to "mult3"
;; below.  Listing it in both reservations gave it this reservation's
;; shorter latency of 2 (assuming the first matching reservation wins)
;; and left the "mult3" entry dead.
(define_insn_reservation "mult2" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlaxy,smlawx"))
  "a_e*2,a_m,a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "mult3" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlalxy,mul,mla"))
  "a_e*2,a_m,a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "mult4" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "muls,mlas"))
  "a_e*4,a_m,a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
;; Long multiplies without flag setting: three passes through the
;; execute stage, result usable once the memory stage has completed.
(define_insn_reservation "mult5" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umull,umlal,smull,smlal"))
  "a_e*3,a_m,a_w")

;; Flag-setting long multiplies ("umulls", "umlals", "smulls",
;; "smlals"): the execute stage is occupied for five iterations so that
;; the flags can be set, while the value result is already available
;; after four iterations.
(define_insn_reservation "mult6" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umulls,umlals,smulls,smlals"))
  "a_e*5,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!") and those without.  These models assume that all
;; memory references hit in dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores which use a scaled register offset or scaled
;; register pre-indexed addressing mode take three cycles EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4-byte loads there is a bypass from the load stage.

;; Single-register loads (byte or word): the ALU and LSU execute stages
;; are claimed together, then the LSU memory stage, then both write
;; stages.
(define_insn_reservation "load1_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load_byte,load1"))
  "a_e+l_e,l_m,a_w+l_w")

;; Single-register stores occupy the same units as the single-register
;; loads; latency is 0.
(define_insn_reservation "store1_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store1"))
  "a_e+l_e,l_m,a_w+l_w")

;; A value produced by a load can be stored by the store that
;; immediately follows it: the load-to-store latency drops to 1 when
;; the guard "arm_no_early_store_addr_dep" (defined outside this file)
;; accepts the instruction pair.
(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")

;; On a LDM/STM operation, the LSU pipeline iterates until all of the
;; registers have been processed.
;;
;; The time it takes to load the data depends on whether or not the
;; base address is 64-bit aligned; if it is not, an additional cycle
;; is required.  This model assumes that the address is always 64-bit
;; aligned.  Because the processor can load two registers per cycle,
;; that assumption means that we use the same instruction reservations
;; for loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is stalled until the completion of the last memory
;; stage in the LSU pipeline.  That is modeled by keeping the ALU
;; execute stage busy until that point.
;;
;; As with ALU operations, if one of the destination registers is the
;; PC, there are additional stalls; that is not modeled.
;; Two-register load: a single pass through the LSU stages, with the
;; ALU execute and write stages claimed alongside.
(define_insn_reservation "load2_op" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "load2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Two-register store: same unit usage as the two-register load.
(define_insn_reservation "store2_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "store2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Three- and four-register loads: the LSU execute and memory stages
;; are each occupied for two cycles, and the ALU execute stage is held
;; busy until the last LSU memory cycle.
(define_insn_reservation "load34_op" 3
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "load3,load4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;; Three- and four-register stores: same unit usage as the
;; corresponding loads.
(define_insn_reservation "store34_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "store3,store4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

;; Branches reserve no units and have zero latency.
(define_insn_reservation "branch_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "branch"))
  "nothing")

;; The latency for a call is not predictable.  Therefore, we use 32 as
;; roughly equivalent to positive infinity.
;; Calls reserve no pipeline units; the latency of 32 is a stand-in
;; for "unknown, effectively unbounded".
(define_insn_reservation "call_op" 32
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "call"))
  "nothing")