Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, lsl, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, x7, lsl #2]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10054033166001383217192510001000100015208039839822132561000100020003987711100110001000100043103803810386139447311611395141071000395399399399399
10043983044001383217192510001000100015267139839822132561000100020003997911100110001000100043103803910386139447321611486141471000404399401399395
10043983044101383217192510001000100015274039839822132561000100020003987711100110001000100043103803810396138447311611395141471000399395399401399
10044023044001379217192510001000100015267039839821732571000100020003997911100110001000100043103803910386139447311611395141471000399401400396399
10043983045001383217192510001000100015274139840022132561000100020003987711100110001000100044103803910386139447311611395141471000399399399399399
10043983044001383219192510001000100015274039839822132561000100020003947711100110001000100043103803810386139447311622396141471000400402395399399
10043983053001383217192510001000100015274039839922132521000100020003987711100110001000100043103803810386139447311611395101471000399399399399399
10043983044001383217192510001000100015274139839922132561000100020003987711100110001000100044103803810386139447311611395141471000399399395399399
10043943044001379217192510001000100015274039939822132551000100020003987711100110001000100044103803810386139447511611395101071000399405399412399
10044012044101383217162510001000100015187039839822132521000100020003997911100110001000100044103803810386138437311611395141471000399400400399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, x7, lsl #2]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005352500100070035697355970625401043010310001301001000061600533420620496697007005070050646463649384010030200100006020020000700503511402011009910010000301001000001001000011000000010000110261027122698133000096910000301007005170048700487005170051
402047005052500100070035697815970925401043010310001301001000061600533420620496697007005070050646493649534010030200100006020020000700353511402011009910010000301001000001001000011000000010000110261027123701303000306910000301007004870048700367005170051
402047005052500100070035697645970925401043010310001301001000061600533420620496695507005070050646463649534010030200100006020020000700473511402011009910010000301001000001001000011000000010000110261027122698103000399910000301007005170051700517005170048
40204700505250010007003269781597092540104301001000130100100006160153342062049669700700507004764646364950401003020010000602002000070047351140201100991001000030100100000100100001100002603010000110261027122698133000360010000301007003670051700487004870051
402047005052400100070032697815970625401043010310001301001000061600533420620496696707004770050646313649504010030200100006020020000700903511402011009910010000301001000001001000001000000010000110261027122698133000366910000301007005170048700567005170051
402047004752500100070032697435969525401043010310001301001000061601533420620496697007003570047646433649534010030200100006020020000700473511402011009910010000301001000001001000011000000010000110261027122698103000360610000301007004870048700487005170036
402047003552510100070020697645970625401043010310001301001000061600533420620496697007004770047646433649534010030200100006020020000700473511402011009910010000301001000001001000011000000010000110261027122698103000099910000301007004870048700487005170051
402047003552410100070020697815970625401043010310001301001000061601533420620496695507004770047646463649504010030200100006020020000700473511402011009910010000301001000001001000011000000010000100261027122698103000366910000301007005170048700367004870048
402047005052400101070032697355970925401043010310001301001000061617533420620496696707005070047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000010000010261027122698103000396610000301007005170048700487004870048
402047004752511100070032697355970625401043010310001301001000061601533420620496696707004770047646313649504010030200100006020020000700473511402011009910010000301001000001001000011000000010000110261027122698133000366910000301007004870051700517005170051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352511011021017002669777597012540018300131000230010100006170363342494149669610700567004164674036496640010300201000060020200007004135114002110910100003001010000010100013110003001100001111000252047124698043000300910000300107005770057700427004270057
400247004152511101021017004169702597122540018300161000230161100006170093342494049669610700567005664671036497840010300201000060020200007005635114002110910100003001010000010100012110002011100001111100252027134698043000600010000300107005770115700427006270057
400247005352410100020007004169780597012540014300131000230010100006170363341769149669730700567005364674036496640010300201000060020200007005335114002110910100003001010000010100011010001011100000111100252027142698163000666010000300107005770042700577005770057
400247005352511100020007002669702597022540014300161000230010100006170363342350049669760700567005664674036496640010300201000060020200007005635114002110910100003001010000010100012110003021100001101200252037122698193000699010000300107005770057700427005770057
400247004152510100020007004169780597152540018300161000230010100006170363341769149669610700567005664730036497840010300201000060020200007004135114002110910100003001010000010100022110002001100001101000252027142698043000696910000300107014070057700577005770057
400247005652411000021007002669780597012540018300161000230010100006170363342350049669760700567005364659036497840010300201000060020200007005335114002110910100003001010000010100023110001011100001111100252027134698163000600910000300107005770057700427005770057
400247005652511101010007004169780597012540018300161000230010100006170363341769049669610700567005364674036498140010300201000060020200007005635114002110910100003001010000010100031010002001100001111100252037153698193000696910000300107004270054700427005470057
400247005652511100021017002669702597472540018300161000230010100006170363341769049669760700567005664674036496640010300201000060020200007005635114002110910100003001010000010100011110002021100000111000252047122698043000360910000300107005770057700577004270042
400247005352511011020007002669702597152540018300161000130010100006170093342494049669760700567004164674036496640010300201000060020200007005335114002110910100003001010000010100031110003031100000111000252037122698043000609010000300107004270042700547004270054
400247005652511101011007004169702597122540014300161000230010100006170363342494149669760700537005364674036498140010300201000060020200007005835114002110910100003001010000010100011110001011100001111100255527122698193000690910000300107005770042700577005770042

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, x7, lsl #2]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005352511111200170038697845970125401083010610004301241000061603233423501496697370053700416464936494440100302001000060200200007005335114020110099100100003010010000010010002211000283211000011111261137132698163000666610000301007005470054700547005470138
402047005352511100210170038697845971525401083010610002301001000061603233423501496697370053700536464936495640100302001000060200200007005335114020110099100100003010010000010010002111000177111000011011261127133698163000666610000301007005470057700547005470149
402047005352511100110070038697875971225401083010310002301001000061603233423501496697370053700536464936495640100302001000060200200007005335114020110099100100003010010000010010003311000286241000011011261127152698163000666610000301007005470054700547005470130
4020470060525100002101700386978459712254010830106100023010010000616078334235014966973700537005364649216501840100302001000060200200007005335114020110099100100003010010000010010002111000182211000001110261127123698163000666610000301007005470054700547005470143
402047004152510000210070038697845971225401083010610002301001000061603233423501496697370053700536464936495640100302001000060200200007005335114020110099100100003010010000010010001211000189111000011110261137133698163000666610000301007005470054700547005470147
40204700535251000021017003869784597122540108301031000230100100006160323342350149669767005370053646493649564010030200100006020020000700533521402011009910010000301001000001001001041100030141000011111261139434700823005666610000301007036270456704797047970650
402047005752411200200170038697875971225401083010610002301001000061603233423501496697370053700536464936495641097303581000060200200007005335114020110099100100003010010000010010001211000100191000011110261137123698193000666610000301007005770054700547004270118
402047005352410000110170038697845971225401083010310002301001000061603233417691496697370053700536464936495640100302001000060200200007004135114020110099100100003010010000010010002211000181071000011111261137132698193000666010000301007005470054700547005470123
402047005352510100210170038697025971225401083010610002301001000061603233423501496697370053700416464936495640100302001000060200200007005335114020110099100100003010010000010010001111000294111000011010261127123698163000660610000301007005470144700557005470099
402047005352411100210170038697025971225401083010610002301001000061603233423500496697370056700536464936495640100302001000060200200007005335114020110099100100003010010000010010001111000256211000011011261127133698043000666610000301007005470054700547005470133

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 0e | 0f | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570047525000010070035697025970125400143001010001300101000061706833422061496697070047700476465336497240010300201000060020200007003535114002110910100003001010000010100000100000010000100252067135698133000396010000300107003670036700547013570049
4002470047525000060070020697775970925400103001310004300101000061710433421101496697070050700506466836497540010300201000060020200007005035114002110910100003001010000010100001100001010000100252057146698133000366610000300107004870048700487016970036
4002470050524000010070020697775973225400143001010000300101000061698233422061496695570050700556466836496040010300201000060020200007005035114002110910100003001010000010100001100000010000000252057155698103000099910000300107005170036701287005470052
400247005052400001900070032697775972625400103001310001300101000061695233422061496695570035700506466536497540010300201000060020200007005035114002110910100003001010000010100000100000010000010252068853697983000066010000300107004870048700427012070051
4002470037524000011070038697775977525400103001010001300101000061695233422061496695570050700356466536498040010300201000060020200007003535114002110910100003001010000010100000100000010000110252067153698103000300610000300107003670048700547014170036
4002470047524000010070020697775974125400143001310001300101000061695233420621496697070035700506465336496040010300201000060020200007003535114002110910100003001010000010100001100000010000000252037154698133000390010000300107003670051700547014470048
4002470035524000010070035697775976725400103001310000300101000061698233414701496695570035700476466536496040010300201000060020200007005077114002110910100003001010000010100001100000010000010252057155698133000396010000300107003670036700487007970051
40024700475250000101070020697775976025400143001310001300101000061695233422061496695570035700476466836497540010300201000060020200007003535114002110910100003001010000010100001100000010000010252037147697983000066910000300107004870051700547015270054
400247005052400001771070035697775971425400143001010001300101000061706833414701496695970050700356466836497240010300201000060020200007004735114002110910100003001010000010100001100001010001110252057165698133000090910000300107005170048700427010070048
4002470035525001011070032697775973825400143001310001300101000061698233414701496695570050700506466836497540010300201000060020200007003535114002110910100003001010000010100000100000010000110252057164698103000300910000300107003670036700547013370036

Test 4: throughput

Count: 8

Code:

  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  ldr w0, [x6, x7, lsl #2]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526734200101006500326718221181625801001008000010080015500116659604923627267282672816656616659801142008002420016004826728561180201100991008000010080000110080000039800420104680000315942191111511801600267251104800001002671226734267332672526725
802042672820000010411002671321801825801001008000010080016500116722404923653267142673316661616667801152008002420016004826714631180201100991008000010080000010080020190800610006280041311901911115118016002671110103800001002673426745267342673526734
8020426714200100106510026699221017258010010080000100800155001169611049236542671426733166616166668011620080024200160048267338211802011009910080000100800000100800192042800601006280042316001901115118016002673010103800001002674026735267342673426734
80204267342001110165103267190018025801001008000010080016500116961104923634267192671416661616685801162008002420016004826714821180201100991008000010080000010080020194280019111628004101600190111511801600267311003800001002673626715267402673426716
80204267332001000065003267182018162580100100800001008001650011668360492363426733267341666161668680116200800242001600482673364118020110099100800001008000001008002121080061001618000031620190111511801600267310100800001002673126734267152671526734
8020426733200110006500326718201802580100100800001008001650011698051492365326715267141666161668580116200800242001600482671482118020110099100800001008000001008002020428006020162800413160421911115118016002673010103800001002673926741267152673526734
8020426714200100006500226718218180258010010080000100800165001169805049236532673326733166426166668011520080024200160048267348211802011009910080000100800000100800202142800600026280041006001901115118016002673010103800001002674426803267362674326734
80204267342001110065002267182180172580100100800001008001650011698050492365326736267151666161668580115200800242001600482673482118020110099100800001008000011008002019428006000063800410119019022251281231126730003800001002673726721267342673426734
8020426714201101006610326718218181625801001008000010080021500115892304923634267142673316651916654801202008003020016006026733821180201100991008000010080000010080019194280190003628004131600191222512912311267121003800001002673926789267372674526734
8020426733200102106500326718218181725801001008000010080020500116673004923635267332673316651916673801212008003020016006026733641180201100991008000010080000010080019210800622002180000316042191222512812311267110103800001002674526734267152671626734

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267342001110006501032671921818812580010108000010800005011678721492365326733267151667831671480010208000020160000267338211800211091080000108000011080019204208006010062800413160421910502051624267111010380000102683026739268472672426735
80024267332011011006500032671831818232580010108000010800005011677081492365326733267331667931671480010208000020160000267338211800211091080000108000001080021204308006010062800403160421910502021624267301010380000102684226746267812672826734
80024267332001110006401032671921821262580010108000010800005011682491492365326715267341667831671380010208000020160000267348211800211091080000108000001080022204208006000062800413160421910502041644267301010380000102683126745268422674626734
80024267152001000006501032669921818192580010108000010800005011673471492365426733267331667931671480010208000020160000267338211800211091080000108000001080020194208006010065800410160421900502021624267301010380000102683026748267342718526741
80024267392001010006501032671801818282580010108000010800005011702521492365326733267331667831671380010208000020160000267348311800211091080000108000001080021194208006010162800413160421900502021644267301010380000102683526743267942674226734
8002426733200110000650103267182021222580010108000010800005011663111492365326715267151667831671380010208000020160000267348211800211091080000108000001080019194208006001262800413019421910502021666267301010380000102684326740267372674226734
800242673320011000054501032671821818242580010108000010800005011669601492365426733267331667931669580010208000020160000267338211800211091080000108000001080019204208006002065800413163421910502041642267301010380000102673426743267382673426738
80024267342001010006501032671921818202580010108000010800005011677081492365326733267331667831671380010208000020160000267338211800211091080000108000001080020204208006000162800413160421920502041666267301010380000102684326738267372682426734
80024267332001110006501022671921818192580010108000010800005011677081492365326734267341667831671380010208000020160000267148211800211091080000108000001080019194208006110162800423060421900502021624267301010380000102684026740267372673726734
800242673520011100065010326718218181012580010108000010800005011702521492365326734267331667831671380010208000020160000267348211800211091080000108000001080019194208006310162800413160421910502021624267311010380000102682826740267362674026734