// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file defines the lifting pass which tries to "lift" Alloc
// cells (new/local variables) into SSA registers, replacing loads
// with the dominating stored value, eliminating loads and stores, and
// inserting φ- and σ-nodes as needed.

// Cited papers and resources:
//
// Ron Cytron et al. 1991. Efficiently computing SSA form...
// http://doi.acm.org/10.1145/115372.115320
//
// Cooper, Harvey, Kennedy. 2001. A Simple, Fast Dominance Algorithm.
// Software Practice and Experience 2001, 4:1-10.
// http://www.hipersoft.rice.edu/grads/publications/dom14.pdf
//
// Daniel Berlin, llvmdev mailing list, 2012.
// http://lists.cs.uiuc.edu/pipermail/llvmdev/2012-January/046638.html
// (Be sure to expand the whole thread.)
//
// C. Scott Ananian. 1997. The static single information form.
//
// Jeremy Singer. 2006. Static program analysis based on virtual register renaming.

// TODO(adonovan): opt: there are many optimizations worth evaluating, and
// the conventional wisdom for SSA construction is that a simple
// algorithm well engineered often beats those of better asymptotic
// complexity on all but the most egregious inputs.
//
// Danny Berlin suggests that the Cooper et al. algorithm for
// computing the dominance frontier is superior to Cytron et al.
// Furthermore he recommends that rather than computing the DF for the
// whole function then renaming all alloc cells, it may be cheaper to
// compute the DF for each alloc cell separately and throw it away.
//
// Consider exploiting liveness information to avoid creating dead
// φ-nodes which we then immediately remove.
//
// Also see many other "TODO: opt" suggestions in the code.
package ir

import (
	"fmt"
	"go/types"
	"os"
)

// If true, show diagnostic information at each step of lifting.
const debugLifting = false
// domFrontier maps each block to the set of blocks in its dominance
// frontier. The outer slice is conceptually a map keyed by
// Block.Index. The inner slice is conceptually a set, possibly
// containing duplicates.
//
// TODO(adonovan): opt: measure impact of dups; consider a packed bit
// representation, e.g. big.Int, and bitwise parallel operations for
// the union step in the Children loop.
//
// domFrontier's methods mutate the slice's elements but not its
// length, so their receivers needn't be pointers.
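//
// For example, in the diamond CFG 0→1, 0→2, 1→3, 2→3, block 3 is in
// the dominance frontier of both 1 and 2: each of them dominates a
// predecessor of 3 but neither strictly dominates 3 itself, so df[1]
// and df[2] both contain block 3.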
type domFrontier [][]*BasicBlock

func (df domFrontier) add(u, v *BasicBlock) {
	df[u.Index] = append(df[u.Index], v)
}
// build builds the dominance frontier df for the dominator tree of
// fn, using the algorithm found in A Simple, Fast Dominance
// Algorithm, Figure 5.
//
// TODO(adonovan): opt: consider Berlin approach, computing pruned SSA
// by pruning the entire IDF computation, rather than merely pruning
// the DF -> IDF step.
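//
// The loop below is the classic "runner" walk from that figure: for
// each join point b (a block with two or more predecessors), each
// predecessor p walks up the immediate-dominator chain until it
// reaches b's immediate dominator, and every block visited along the
// way gains b in its dominance frontier.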
func (df domFrontier) build(fn *Function) {
	for _, b := range fn.Blocks {
		if len(b.Preds) >= 2 {
			for _, p := range b.Preds {
				runner := p
				for runner != b.dom.idom {
					df.add(runner, b)
					runner = runner.dom.idom
				}
			}
		}
	}
}
func buildDomFrontier(fn *Function) domFrontier {
	df := make(domFrontier, len(fn.Blocks))
	df.build(fn)
	return df
}
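// postDomFrontier is the analogue of domFrontier for the
// post-dominator tree: it maps each block to its post-dominance
// frontier, which is where σ-nodes must be placed.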
type postDomFrontier [][]*BasicBlock

func (rdf postDomFrontier) add(u, v *BasicBlock) {
	rdf[u.Index] = append(rdf[u.Index], v)
}

func (rdf postDomFrontier) build(fn *Function) {
	for _, b := range fn.Blocks {
		if len(b.Succs) >= 2 {
			for _, s := range b.Succs {
				runner := s
				for runner != b.pdom.idom {
					rdf.add(runner, b)
					runner = runner.pdom.idom
				}
			}
		}
	}
}

func buildPostDomFrontier(fn *Function) postDomFrontier {
	rdf := make(postDomFrontier, len(fn.Blocks))
	rdf.build(fn)
	return rdf
}
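// removeInstr removes all occurrences of instr from refs in place,
// nils out the freed tail to aid GC, and returns the shortened slice.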
func removeInstr(refs []Instruction, instr Instruction) []Instruction {
	i := 0
	for _, ref := range refs {
		if ref == instr {
			continue // drop it
		}
		refs[i] = ref
		i++
	}
	for j := i; j != len(refs); j++ {
		refs[j] = nil // aid GC
	}
	return refs[:i]
}
func clearInstrs(instrs []Instruction) {
	for i := range instrs {
		instrs[i] = nil
	}
}
// lift replaces local and new Allocs accessed only with
// load/store by IR registers, inserting φ- and σ-nodes where necessary.
// The result is a program in pruned SSI form.
//
// Preconditions:
// - fn has no dead blocks (blockopt has run).
// - Def/use info (Operands and Referrers) is up-to-date.
// - The dominator tree is up-to-date.
func lift(fn *Function) {
	// TODO(adonovan): opt: lots of little optimizations may be
	// worthwhile here, especially if they cause us to avoid
	// buildDomFrontier. For example:
	//
	// - Alloc never loaded? Eliminate.
	// - Alloc never stored? Replace all loads with a zero constant.
	// - Alloc stored once? Replace loads with dominating store;
	//   don't forget that an Alloc is itself an effective store
	//   of zero.
	// - Alloc used only within a single block?
	//   Use degenerate algorithm avoiding φ-nodes.
	// - Consider synergy with scalar replacement of aggregates (SRA).
	//   e.g. *(&x.f) where x is an Alloc.
	//   Perhaps we'd get better results if we generated this as x.f
	//   i.e. Field(x, .f) instead of Load(FieldIndex(x, .f)).
	//
	// But we will start with the simplest correct code.
	var df domFrontier
	var rdf postDomFrontier
	var closure *closure
	var newPhis newPhiMap
	var newSigmas newSigmaMap
	// During this pass we will replace some BasicBlock.Instrs
	// (allocs, loads and stores) with nil, keeping a count in
	// BasicBlock.gaps. At the end we will reset Instrs to the
	// concatenation of all non-dead newPhis and non-nil Instrs
	// for the block, reusing the original array if space permits.

	// While we're here, we also eliminate 'rundefers'
	// instructions in functions that contain no 'defer'
	// instructions.
	// Determine which allocs we can lift and number them densely.
	// The renaming phase uses this numbering for compact maps.
	numAllocs := 0
	usesDefer := false
	for _, b := range fn.Blocks {
		b.gaps = 0
		b.rundefers = 0
		for _, instr := range b.Instrs {
			switch instr := instr.(type) {
			case *Alloc:
				if !liftable(instr) {
					instr.index = -1
					continue
				}
				instr.index = numAllocs
				numAllocs++
			case *Defer:
				usesDefer = true
			case *RunDefers:
				b.rundefers++
			}
		}
	}

	df = buildDomFrontier(fn)
	rdf = buildPostDomFrontier(fn)
	if len(fn.Blocks) > 2 {
		closure = transitiveClosure(fn)
	}
	newPhis = make(newPhiMap, len(fn.Blocks))
	newSigmas = make(newSigmaMap, len(fn.Blocks))
	if debugLifting {
		title := false
		for i, blocks := range df {
			if blocks != nil {
				if !title {
					fmt.Fprintf(os.Stderr, "Dominance frontier of %s:\n", fn)
					title = true
				}
				fmt.Fprintf(os.Stderr, "\t%s: %s\n", fn.Blocks[i], blocks)
			}
		}
	}

	for _, b := range fn.Blocks {
		for _, instr := range b.Instrs {
			if instr, ok := instr.(*Alloc); ok && instr.index >= 0 {
				liftAlloc(closure, df, rdf, instr, newPhis, newSigmas)
			}
		}
	}
	// renaming maps an alloc (keyed by index) to its replacement
	// value. Initially the renaming contains nil, signifying the
	// zero constant of the appropriate type; we construct the
	// Const lazily at most once on each path through the domtree.
	// TODO(adonovan): opt: cache per-function not per subtree.
	renaming := make([]Value, numAllocs)

	rename(fn.Blocks[0], renaming, newPhis, newSigmas)
	simplifyPhis(newPhis)

	// Eliminate dead φ- and σ-nodes.
	markLiveNodes(fn.Blocks, newPhis, newSigmas)
	// Prepend remaining live φ-nodes to each block and possibly kill rundefers.
	for _, b := range fn.Blocks {
		var head []Instruction
		if numAllocs > 0 {
			nps := newPhis[b.Index]
			head = make([]Instruction, 0, len(nps))
			for _, pred := range b.Preds {
				nss := newSigmas[pred.Index]
				idx := pred.succIndex(b)
				for _, newSigma := range nss {
					if sigma := newSigma.sigmas[idx]; sigma != nil && sigma.live {
						head = append(head, sigma)

						// we didn't populate referrers before, as most
						// sigma nodes will be killed
						if refs := sigma.X.Referrers(); refs != nil {
							*refs = append(*refs, sigma)
						}
					} else if sigma != nil {
						sigma.block = nil
					}
				}
			}
			for _, np := range nps {
				if np.phi.live {
					head = append(head, np.phi)
				} else {
					// Unlink the dead φ from the referrer lists of
					// its edge values.
					for _, edge := range np.phi.Edges {
						if refs := edge.Referrers(); refs != nil {
							*refs = removeInstr(*refs, np.phi)
						}
					}
				}
			}
		}
		rundefersToKill := b.rundefers
		if usesDefer {
			rundefersToKill = 0
		}

		j := len(head)

		if j+b.gaps+rundefersToKill == 0 {
			continue // fast path: no new phis or gaps
		}

		// We could do straight copies instead of element-wise copies
		// when both b.gaps and rundefersToKill are zero. However,
		// that seems to only be the case ~1% of the time, which
		// doesn't seem worth the extra branch.

		// Remove dead instructions, add phis and sigmas
		ns := len(b.Instrs) + j - b.gaps - rundefersToKill
		if ns <= cap(b.Instrs) {
			// b.Instrs has enough capacity to store all instructions

			// OPT(dh): check cap vs the actually required space; if
			// there is a big enough difference, it may be worth
			// allocating a new slice, to avoid pinning memory.
			dst := b.Instrs[:cap(b.Instrs)]
			i := len(dst) - 1
			for n := len(b.Instrs) - 1; n >= 0; n-- {
				instr := dst[n]
				if instr == nil {
					continue // dead instruction
				}
				if _, ok := instr.(*RunDefers); ok && rundefersToKill > 0 {
					rundefersToKill--
					continue
				}
				dst[i] = instr
				i--
			}
			off := i + 1 - len(head)
			clearInstrs(dst[:off]) // aid GC
			dst = dst[off:]
			copy(dst, head)
			b.Instrs = dst
		} else {
			// not enough space, so allocate a new slice and copy
			// over
			dst := make([]Instruction, ns)
			copy(dst, head)
			for _, instr := range b.Instrs {
				if instr == nil {
					continue // dead instruction
				}
				if _, ok := instr.(*RunDefers); ok && rundefersToKill > 0 {
					rundefersToKill--
					continue
				}
				dst[j] = instr
				j++
			}
			b.Instrs = dst
		}
	}
	// Remove any fn.Locals that were lifted.
	j := 0
	for _, l := range fn.Locals {
		if l.index < 0 {
			fn.Locals[j] = l
			j++
		}
	}
	// Nil out fn.Locals[j:] to aid GC.
	for i := j; i < len(fn.Locals); i++ {
		fn.Locals[i] = nil
	}
	fn.Locals = fn.Locals[:j]
}
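// hasDirectReferrer reports whether instr is referred to by at least
// one non-φ, non-σ instruction; such referrers are the roots of the
// liveness computation below.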
func hasDirectReferrer(instr Instruction) bool {
	for _, instr := range *instr.Referrers() {
		switch instr.(type) {
		case *Phi, *Sigma:
			// ignore
		default:
			return true
		}
	}
	return false
}
func markLiveNodes(blocks []*BasicBlock, newPhis newPhiMap, newSigmas newSigmaMap) {
	// Phi and sigma nodes are considered live if a non-phi, non-sigma
	// node uses them. Once we find a node that is live, we mark all
	// of its operands as used, too.
	for _, npList := range newPhis {
		for _, np := range npList {
			phi := np.phi
			if !phi.live && hasDirectReferrer(phi) {
				markLivePhi(phi)
			}
		}
	}
	for _, npList := range newSigmas {
		for _, np := range npList {
			for _, sigma := range np.sigmas {
				if sigma != nil && !sigma.live && hasDirectReferrer(sigma) {
					markLiveSigma(sigma)
				}
			}
		}
	}
	// Existing φ-nodes due to && and || operators
	// are all considered live (see Go issue 19622).
	for _, b := range blocks {
		for _, phi := range b.phis() {
			markLivePhi(phi.(*Phi))
		}
	}
}
func markLivePhi(phi *Phi) {
	phi.live = true
	for _, rand := range phi.Edges {
		switch rand := rand.(type) {
		case *Phi:
			if !rand.live {
				markLivePhi(rand)
			}
		case *Sigma:
			if !rand.live {
				markLiveSigma(rand)
			}
		}
	}
}
func markLiveSigma(sigma *Sigma) {
	sigma.live = true
	switch rand := sigma.X.(type) {
	case *Phi:
		if !rand.live {
			markLivePhi(rand)
		}
	case *Sigma:
		if !rand.live {
			markLiveSigma(rand)
		}
	}
}
// simplifyPhis replaces trivial phis with non-phi alternatives. Phi
// nodes where all edges are identical, or consist of only the phi
// itself and one other value, may be replaced with the value.
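//
// For example, x = φ(a, a) and x = φ(a, x) both simplify to a.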
func simplifyPhis(newPhis newPhiMap) {
	// find all phis that are trivial and can be replaced with a
	// non-phi value. run until we reach a fixpoint, because replacing
	// a phi may make other phis trivial.
	for changed := true; changed; {
		changed = false
		for _, npList := range newPhis {
			for _, np := range npList {
				if np.phi.live {
					// we're reusing 'live' to mean 'dead' in the context of simplifyPhis
					continue
				}
				if r, ok := isUselessPhi(np.phi); ok {
					// useless phi, replace its uses with the
					// replacement value. the dead phi pass will clean
					// up the phi afterwards.
					replaceAll(np.phi, r)
					np.phi.live = true
					changed = true
				}
			}
		}
	}

	// reset the 'live' field so that the liveness pass can use it
	// with its normal meaning
	for _, npList := range newPhis {
		for _, np := range npList {
			np.phi.live = false
		}
	}
}
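// BlockSet is a set of BasicBlocks keyed by BasicBlock.Index,
// maintaining a count of members and an index hint that lets Take
// resume scanning near the most recently added element.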
type BlockSet struct {
	idx    int // index hint for Take
	values []bool
	count  int
}

func NewBlockSet(size int) *BlockSet {
	return &BlockSet{values: make([]bool, size)}
}

func (s *BlockSet) Set(s2 *BlockSet) {
	copy(s.values, s2.values)
	s.count = 0
	for _, v := range s.values {
		if v {
			s.count++
		}
	}
}

func (s *BlockSet) Num() int {
	return s.count
}

func (s *BlockSet) Has(b *BasicBlock) bool {
	if b.Index >= len(s.values) {
		return false
	}
	return s.values[b.Index]
}

// Add adds b to the set and reports whether the set changed.
func (s *BlockSet) Add(b *BasicBlock) bool {
	if s.values[b.Index] {
		return false
	}
	s.count++
	s.values[b.Index] = true
	s.idx = b.Index
	return true
}

func (s *BlockSet) Clear() {
	for j := range s.values {
		s.values[j] = false
	}
	s.count = 0
}

// Take removes an arbitrary element from the set s and
// returns its index, or -1 if s is empty.
func (s *BlockSet) Take() int {
	for i := s.idx; i < len(s.values); i++ {
		if s.values[i] {
			s.values[i] = false
			s.idx = i
			s.count--
			return i
		}
	}
	for i := 0; i < s.idx; i++ {
		if s.values[i] {
			s.values[i] = false
			s.idx = i
			s.count--
			return i
		}
	}
	return -1
}
// closure encodes the transitive closure of the control-flow graph:
// for each block, the set of blocks reachable from it, stored as
// sorted, packed intervals of block indices.
type closure struct {
	span       []uint32
	reachables []interval
}

// interval is a packed interval of block indices: bit 31 flags a
// two-word interval, the low numBits bits hold a block index, and
// the bits in between hold a length.
type interval uint32

const (
	flagMask   = 1 << 31
	numBits    = 20
	lengthBits = 32 - numBits - 1
	lengthMask = (1<<lengthBits - 1) << numBits
	numMask    = 1<<numBits - 1
)
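// A short interval [start, start+l], l < 1<<lengthBits, occupies a
// single word, l<<numBits | start; a longer one is spelled as two
// words, flagMask|start followed by end. For example, with numBits =
// 20 the interval [5, 7] packs into the single word 2<<20 | 5.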
func (c closure) has(s, v *BasicBlock) bool {
	idx := uint32(v.Index)
	if idx == 1 || s.Dominates(v) {
		// block 1 is reachable from every block, and a dominee is
		// trivially reachable from its dominator
		return true
	}
	r := c.reachable(s.Index)
	for i := 0; i < len(r); i++ {
		inv := r[i]
		var start, end uint32
		if inv&flagMask == 0 {
			// single-word interval
			start = uint32(inv & numMask)
			end = start + uint32(inv&lengthMask)>>numBits
		} else {
			// two-word interval
			i++
			start = uint32(inv & numMask)
			end = uint32(r[i])
		}
		if idx >= start && idx <= end {
			return true
		}
	}
	return false
}
func (c closure) reachable(id int) []interval {
	return c.reachables[c.span[id]:c.span[id+1]]
}
func (c closure) walk(current *BasicBlock, b *BasicBlock, visited []bool) {
	visited[b.Index] = true
	for _, succ := range b.Succs {
		if visited[succ.Index] {
			continue
		}
		visited[succ.Index] = true
		c.walk(current, succ, visited)
	}
}
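// transitiveClosure computes, for every block of fn except the entry
// block, the set of blocks reachable from it, run-length encoding
// each set as intervals of consecutive block indices.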
func transitiveClosure(fn *Function) *closure {
	reachable := make([]bool, len(fn.Blocks))
	c := &closure{}
	c.span = make([]uint32, len(fn.Blocks)+1)

	addInterval := func(start, end uint32) {
		if l := end - start; l <= 1<<lengthBits-1 {
			n := interval(l<<numBits | start)
			c.reachables = append(c.reachables, n)
		} else {
			// the length doesn't fit in a single word; use two words
			n1 := interval(1<<31 | start)
			n2 := interval(end)
			c.reachables = append(c.reachables, n1, n2)
		}
	}

	for i, b := range fn.Blocks[1:] {
		for i := range reachable {
			reachable[i] = false
		}

		c.walk(b, b, reachable)
		start := ^uint32(0)
		for id, isReachable := range reachable {
			if !isReachable {
				if start != ^uint32(0) {
					end := uint32(id) - 1
					addInterval(start, end)
					start = ^uint32(0)
				}
			} else if start == ^uint32(0) {
				start = uint32(id)
			}
		}
		if start != ^uint32(0) {
			addInterval(start, uint32(len(reachable))-1)
		}

		c.span[i+2] = uint32(len(c.reachables))
	}
	return c
}
// newPhi is a pair of a newly introduced φ-node and the lifted Alloc
// it replaces.
type newPhi struct {
	phi   *Phi
	alloc *Alloc
}

// newSigma pairs a lifted Alloc with the σ-nodes for each outgoing
// edge of a branch block; entries are nil for edges that need no σ.
type newSigma struct {
	alloc  *Alloc
	sigmas []*Sigma // one per successor edge
}

// newPhiMap records for each basic block, the set of newPhis that
// must be prepended to the block.
type newPhiMap [][]newPhi
type newSigmaMap [][]newSigma
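// For example, in
//
//	if cond { use1(x) } else { use2(x) }
//
// SSI form splits x at the branch: the then-branch uses x1 = σ(x)
// and the else-branch uses x2 = σ(x), giving each branch's uses a
// distinct name.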
func liftable(alloc *Alloc) bool {
	// Don't lift aggregates into registers, because we don't have
	// a way to express their zero-constants.
	switch deref(alloc.Type()).Underlying().(type) {
	case *types.Array, *types.Struct:
		return false
	}

	fn := alloc.Parent()
	// Don't lift named return values in functions that defer
	// calls that may recover from panic.
	if fn.Recover != nil {
		for _, nr := range fn.namedResults {
			if nr == alloc {
				return false
			}
		}
	}

	for _, instr := range *alloc.Referrers() {
		switch instr := instr.(type) {
		case *Store:
			if instr.Val == alloc {
				return false // address used as value
			}
			if instr.Addr != alloc {
				panic("Alloc.Referrers is inconsistent")
			}
		case *Load:
			if instr.X != alloc {
				panic("Alloc.Referrers is inconsistent")
			}
		case *DebugRef:
			// ok
		default:
			return false
		}
	}
	return true
}
// liftAlloc determines whether alloc can be lifted into registers,
// and if so, it populates newPhis with all the φ-nodes it may require
// and newSigmas with the σ-nodes.
func liftAlloc(closure *closure, df domFrontier, rdf postDomFrontier, alloc *Alloc, newPhis newPhiMap, newSigmas newSigmaMap) {
	fn := alloc.Parent()

	defblocks := fn.blockset(0)
	useblocks := fn.blockset(1)
	Aphi := fn.blockset(2)
	Asigma := fn.blockset(3)
	W := fn.blockset(4)
	// Compute defblocks, the set of blocks containing a
	// definition of the alloc cell.
	for _, instr := range *alloc.Referrers() {
		// Bail out if we discover the alloc is not liftable;
		// the only operations permitted to use the alloc are
		// loads/stores into the cell, and DebugRef.
		switch instr := instr.(type) {
		case *Store:
			defblocks.Add(instr.Block())
		case *Load:
			useblocks.Add(instr.Block())
			for _, ref := range *instr.Referrers() {
				useblocks.Add(ref.Block())
			}
		}
	}
	// The Alloc itself counts as a (zero) definition of the cell.
	defblocks.Add(alloc.Block())

	if debugLifting {
		fmt.Fprintln(os.Stderr, "\tlifting ", alloc, alloc.Name())
	}
	// What follows is the body of the main loop of the insert-φ
	// function described by Cytron et al, but instead of using
	// counter tricks, we just reset the 'hasAlready' and 'work'
	// sets each iteration. These are bitmaps so it's pretty cheap.

	// Initialize W and work to defblocks.

	for change := true; change; {
		change = false
		W.Set(defblocks)

		// Traverse iterated dominance frontier, inserting φ-nodes.
		for i := W.Take(); i != -1; i = W.Take() {
			n := fn.Blocks[i]
			for _, y := range df[n.Index] {
				if Aphi.Add(y) {
					if len(*alloc.Referrers()) == 0 {
						continue
					}
					// Prune φ-nodes that can never be observed:
					// only place a φ at y if some load of the cell
					// is reachable from y.
					live := closure == nil
					if !live {
						for _, ref := range *alloc.Referrers() {
							if _, ok := ref.(*Load); ok {
								if closure.has(y, ref.Block()) {
									live = true
									break
								}
							}
						}
					}
					if !live {
						continue
					}

					// Create φ-node.
					// It will be prepended to v.Instrs later, if needed.
					phi := &Phi{
						Edges: make([]Value, len(y.Preds)),
					}

					phi.source = alloc.source
					phi.setType(deref(alloc.Type()))
					if debugLifting {
						fmt.Fprintf(os.Stderr, "\tplace %s = %s at block %s\n", phi.Name(), phi, y)
					}
					newPhis[y.Index] = append(newPhis[y.Index], newPhi{phi, alloc})

					for _, p := range y.Preds {
						useblocks.Add(p)
					}
					change = true
					if defblocks.Add(y) {
						W.Add(y)
					}
				}
			}
		}
	}
	// Insert σ-nodes on the iterated post-dominance frontier of
	// useblocks, analogously.
	for change := true; change; {
		change = false
		W.Set(useblocks)
		for i := W.Take(); i != -1; i = W.Take() {
			n := fn.Blocks[i]
			for _, y := range rdf[n.Index] {
				if Asigma.Add(y) {
					sigmas := make([]*Sigma, 0, len(y.Succs))
					anyLive := false
					for _, succ := range y.Succs {
						live := false
						for _, ref := range *alloc.Referrers() {
							if closure == nil || closure.has(succ, ref.Block()) {
								live = true
								anyLive = true
								break
							}
						}
						if live {
							sigma := &Sigma{
								From: y,
								X:    alloc,
							}
							sigma.source = alloc.source
							sigma.setType(deref(alloc.Type()))
							sigmas = append(sigmas, sigma)
						} else {
							sigmas = append(sigmas, nil)
						}
					}

					if anyLive {
						newSigmas[y.Index] = append(newSigmas[y.Index], newSigma{alloc, sigmas})
						for _, s := range y.Succs {
							defblocks.Add(s)
						}
						change = true
						if useblocks.Add(y) {
							W.Add(y)
						}
					}
				}
			}
		}
	}
}
// replaceAll replaces all intraprocedural uses of x with y,
// updating x.Referrers and y.Referrers.
// Precondition: x.Referrers() != nil, i.e. x must be local to some function.
func replaceAll(x, y Value) {
	var rands []*Value
	pxrefs := x.Referrers()
	pyrefs := y.Referrers()
	for _, instr := range *pxrefs {
		rands = instr.Operands(rands[:0]) // recycle storage
		for _, rand := range rands {
			if *rand != nil && *rand == x {
				*rand = y
			}
		}
		if pyrefs != nil {
			*pyrefs = append(*pyrefs, instr) // dups ok
		}
	}
	*pxrefs = nil // x is now unreferenced
}
// renamed returns the value to which alloc is being renamed,
// constructing it lazily if it's the implicit zero initialization.
func renamed(fn *Function, renaming []Value, alloc *Alloc) Value {
	v := renaming[alloc.index]
	if v == nil {
		v = emitConst(fn, zeroConst(deref(alloc.Type())))
		renaming[alloc.index] = v
	}
	return v
}
// rename implements the Cytron et al-based SSI renaming algorithm, a
// preorder traversal of the dominator tree replacing all loads of
// Alloc cells with the value stored to that cell by the dominating
// store instruction.
//
// renaming is a map from *Alloc (keyed by index number) to its
// dominating stored value; newPhis[x] is the set of new φ-nodes to be
// prepended to block x.
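//
// A sketch of the effect on straight-line code:
//
//	x = local int32
//	store x <- 1
//	y = load x
//	print(y)
//
// The store makes 1 the dominating value of the cell x, so the load
// is replaced by 1 (yielding print(1)) and the alloc, store, and
// load are all deleted.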
func rename(u *BasicBlock, renaming []Value, newPhis newPhiMap, newSigmas newSigmaMap) {
	// Each φ-node becomes the new name for its associated Alloc.
	for _, np := range newPhis[u.Index] {
		phi := np.phi
		alloc := np.alloc
		renaming[alloc.index] = phi
	}
	// Rename loads and stores of allocs.
	for i, instr := range u.Instrs {
		switch instr := instr.(type) {
		case *Alloc:
			if instr.index >= 0 { // store of zero to Alloc cell
				// Replace dominated loads by the zero value.
				renaming[instr.index] = nil
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tkill alloc %s\n", instr)
				}
				// Delete the Alloc.
				u.Instrs[i] = nil
				u.gaps++
			}
		case *Store:
			if alloc, ok := instr.Addr.(*Alloc); ok && alloc.index >= 0 { // store to Alloc cell
				// Replace dominated loads by the stored value.
				renaming[alloc.index] = instr.Val
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tkill store %s; new value: %s\n",
						instr, instr.Val.Name())
				}
				// Remove the store from the referrer lists of its operands.
				if refs := instr.Addr.Referrers(); refs != nil {
					*refs = removeInstr(*refs, instr)
				}
				if refs := instr.Val.Referrers(); refs != nil {
					*refs = removeInstr(*refs, instr)
				}
				// Delete the Store.
				u.Instrs[i] = nil
				u.gaps++
			}
		case *Load:
			if alloc, ok := instr.X.(*Alloc); ok && alloc.index >= 0 { // load of Alloc cell
				// In theory, we wouldn't be able to replace loads
				// directly, because a loaded value could be used in
				// different branches, in which case it should be
				// replaced with different sigma nodes. But we can't
				// simply defer replacement, either, because then
				// later stores might incorrectly affect this load.
				//
				// To avoid doing renaming on _all_ values (instead of
				// just loads and stores like we're doing), we make
				// sure during code generation that each load is only
				// used in one block. For example, in constant switch
				// statements, where the tag is only evaluated once,
				// we store it in a temporary and load it for each
				// comparison, so that we have individual loads to
				// replace.
				newval := renamed(u.Parent(), renaming, alloc)
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tupdate load %s = %s with %s\n",
						instr.Name(), instr, newval)
				}
				replaceAll(instr, newval)
				// Delete the Load.
				u.Instrs[i] = nil
				u.gaps++
			}
		case *DebugRef:
			if x, ok := instr.X.(*Alloc); ok && x.index >= 0 {
				if instr.IsAddr {
					instr.X = renamed(u.Parent(), renaming, x)
					instr.IsAddr = false

					// Add DebugRef to instr.X's referrers.
					if refs := instr.X.Referrers(); refs != nil {
						*refs = append(*refs, instr)
					}
				} else {
					// A source expression denotes the address
					// of an Alloc that was optimized away.
					instr.X = nil

					// Delete the DebugRef.
					u.Instrs[i] = nil
					u.gaps++
				}
			}
		}
	}
	// update all outgoing sigma nodes with the dominating store
	for _, sigmas := range newSigmas[u.Index] {
		for _, sigma := range sigmas.sigmas {
			if sigma == nil {
				continue
			}
			sigma.X = renamed(u.Parent(), renaming, sigmas.alloc)
		}
	}
	// For each φ-node in a CFG successor, rename the edge.
	for succi, v := range u.Succs {
		phis := newPhis[v.Index]
		if len(phis) == 0 {
			continue
		}
		i := v.predIndex(u)
		for _, np := range phis {
			phi := np.phi
			alloc := np.alloc
			// if there's a sigma node, use it, else use the dominating value
			var newval Value
			for _, sigmas := range newSigmas[u.Index] {
				if sigmas.alloc == alloc && sigmas.sigmas[succi] != nil {
					newval = sigmas.sigmas[succi]
					break
				}
			}
			if newval == nil {
				newval = renamed(u.Parent(), renaming, alloc)
			}
			if debugLifting {
				fmt.Fprintf(os.Stderr, "\tsetphi %s edge %s -> %s (#%d) (alloc=%s) := %s\n",
					phi.Name(), u, v, i, alloc.Name(), newval.Name())
			}
			phi.Edges[i] = newval
			if prefs := newval.Referrers(); prefs != nil {
				*prefs = append(*prefs, phi)
			}
		}
	}
	// Continue depth-first recursion over domtree, pushing a
	// fresh copy of the renaming map for each subtree.
	r := make([]Value, len(renaming))
	for _, v := range u.dom.children {
		// XXX add debugging
		copy(r, renaming)

		// on entry to a block, the incoming sigma nodes become the new values for their alloc
		if idx := u.succIndex(v); idx != -1 {
			for _, sigma := range newSigmas[u.Index] {
				if sigma.sigmas[idx] != nil {
					r[sigma.alloc.index] = sigma.sigmas[idx]
				}
			}
		}
		rename(v, r, newPhis, newSigmas)
	}
}