// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ir

// This file defines the lifting pass which tries to "lift" Alloc
// cells (new/local variables) into SSA registers, replacing loads
// with the dominating stored value, eliminating loads and stores, and
// inserting φ- and σ-nodes as needed.

// Cited papers and resources:
//
// Ron Cytron et al. 1991. Efficiently computing SSA form...
// http://doi.acm.org/10.1145/115372.115320
//
// Cooper, Harvey, Kennedy. 2001. A Simple, Fast Dominance Algorithm.
// Software Practice and Experience 2001, 4:1-10.
// http://www.hipersoft.rice.edu/grads/publications/dom14.pdf
//
// Daniel Berlin, llvmdev mailing list, 2012.
// http://lists.cs.uiuc.edu/pipermail/llvmdev/2012-January/046638.html
// (Be sure to expand the whole thread.)
//
// C. Scott Ananian. 1997. The static single information form.
//
// Jeremy Singer. 2006. Static program analysis based on virtual register renaming.

// TODO(adonovan): opt: there are many optimizations worth evaluating, and
// the conventional wisdom for SSA construction is that a simple
// algorithm well engineered often beats those of better asymptotic
// complexity on all but the most egregious inputs.
//
// Danny Berlin suggests that the Cooper et al. algorithm for
// computing the dominance frontier is superior to Cytron et al.
// Furthermore he recommends that rather than computing the DF for the
// whole function then renaming all alloc cells, it may be cheaper to
// compute the DF for each alloc cell separately and throw it away.
//
// Consider exploiting liveness information to avoid creating dead
// φ-nodes which we then immediately remove.
//
// Also see many other "TODO: opt" suggestions in the code.

import (
	"fmt"
	"go/types"
	"os"
)

// If true, show diagnostic information at each step of lifting.
// Very verbose.
const debugLifting = false

// domFrontier maps each block to the set of blocks in its dominance
// frontier. The outer slice is conceptually a map keyed by
// Block.Index. The inner slice is conceptually a set, possibly
// containing duplicates.
//
// TODO(adonovan): opt: measure impact of dups; consider a packed bit
// representation, e.g. big.Int, and bitwise parallel operations for
// the union step in the Children loop.
//
// domFrontier's methods mutate the slice's elements but not its
// length, so their receivers needn't be pointers.
type domFrontier [][]*BasicBlock

func (df domFrontier) add(u, v *BasicBlock) {
	df[u.Index] = append(df[u.Index], v)
}
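// As a rough worked example (an illustration added here, not taken from the
// cited papers): in the diamond CFG
//
//	b0 → b1, b2;  b1 → b3;  b2 → b3
//
// block b3 has two predecessors, so build walks from each predecessor up the
// dominator tree until it reaches b3's immediate dominator b0, adding b3 to
// the frontier of every block visited along the way. The result is
// DF(b1) = DF(b2) = {b3}, while DF(b0) and DF(b3) are empty; a variable
// defined in b1 or b2 therefore needs a φ-node at b3.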
// build builds the dominance frontier df for the dominator tree of
// fn, using the algorithm found in A Simple, Fast Dominance
// Algorithm, Figure 5.
//
// TODO(adonovan): opt: consider Berlin approach, computing pruned SSA
// by pruning the entire IDF computation, rather than merely pruning
// the DF -> IDF step.
func (df domFrontier) build(fn *Function) {
	for _, b := range fn.Blocks {
		if len(b.Preds) >= 2 {
			for _, p := range b.Preds {
				runner := p
				for runner != b.dom.idom {
					df.add(runner, b)
					runner = runner.dom.idom
				}
			}
		}
	}
}

func buildDomFrontier(fn *Function) domFrontier {
	df := make(domFrontier, len(fn.Blocks))
	df.build(fn)
	return df
}

type postDomFrontier [][]*BasicBlock

func (rdf postDomFrontier) add(u, v *BasicBlock) {
	rdf[u.Index] = append(rdf[u.Index], v)
}

func (rdf postDomFrontier) build(fn *Function) {
	for _, b := range fn.Blocks {
		if len(b.Succs) >= 2 {
			for _, s := range b.Succs {
				runner := s
				for runner != b.pdom.idom {
					rdf.add(runner, b)
					runner = runner.pdom.idom
				}
			}
		}
	}
}

func buildPostDomFrontier(fn *Function) postDomFrontier {
	rdf := make(postDomFrontier, len(fn.Blocks))
	rdf.build(fn)
	return rdf
}

func removeInstr(refs []Instruction, instr Instruction) []Instruction {
	i := 0
	for _, ref := range refs {
		if ref == instr {
			continue
		}
		refs[i] = ref
		i++
	}
	for j := i; j != len(refs); j++ {
		refs[j] = nil // aid GC
	}
	return refs[:i]
}

func clearInstrs(instrs []Instruction) {
	for i := range instrs {
		instrs[i] = nil
	}
}

// lift replaces local and new Allocs accessed only with
// load/store by IR registers, inserting φ- and σ-nodes where necessary.
// The result is a program in pruned SSI form.
//
// Preconditions:
// - fn has no dead blocks (blockopt has run).
// - Def/use info (Operands and Referrers) is up-to-date.
// - The dominator tree is up-to-date.
func lift(fn *Function) {
	// TODO(adonovan): opt: lots of little optimizations may be
	// worthwhile here, especially if they cause us to avoid
	// buildDomFrontier. For example:
	//
	// - Alloc never loaded? Eliminate.
	// - Alloc never stored? Replace all loads with a zero constant.
	// - Alloc stored once? Replace loads with dominating store;
	//   don't forget that an Alloc is itself an effective store
	//   of zero.
	// - Alloc used only within a single block?
	//   Use degenerate algorithm avoiding φ-nodes.
	// - Consider synergy with scalar replacement of aggregates (SRA).
	//   e.g. *(&x.f) where x is an Alloc.
	//   Perhaps we'd get better results if we generated this as x.f
	//   i.e. Field(x, .f) instead of Load(FieldIndex(x, .f)).
	//   Unclear.
	//
	// But we will start with the simplest correct code.
	var df domFrontier
	var rdf postDomFrontier
	var closure *closure
	var newPhis newPhiMap
	var newSigmas newSigmaMap

	// During this pass we will replace some BasicBlock.Instrs
	// (allocs, loads and stores) with nil, keeping a count in
	// BasicBlock.gaps. At the end we will reset Instrs to the
	// concatenation of all non-dead newPhis and non-nil Instrs
	// for the block, reusing the original array if space permits.

	// While we're here, we also eliminate 'rundefers'
	// instructions in functions that contain no 'defer'
	// instructions.
	usesDefer := false

	// Determine which allocs we can lift and number them densely.
	// The renaming phase uses this numbering for compact maps.
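	// For example, a function with three liftable Allocs numbers them 0, 1
	// and 2; renaming[i] in the renaming phase below then always holds the
	// value currently associated with alloc i, with nil standing for its
	// zero value.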
	numAllocs := 0
	for _, b := range fn.Blocks {
		b.gaps = 0
		b.rundefers = 0
		for _, instr := range b.Instrs {
			switch instr := instr.(type) {
			case *Alloc:
				if !liftable(instr) {
					instr.index = -1
					continue
				}
				index := -1
				if numAllocs == 0 {
					df = buildDomFrontier(fn)
					rdf = buildPostDomFrontier(fn)
					if len(fn.Blocks) > 2 {
						closure = transitiveClosure(fn)
					}
					newPhis = make(newPhiMap, len(fn.Blocks))
					newSigmas = make(newSigmaMap, len(fn.Blocks))

					if debugLifting {
						title := false
						for i, blocks := range df {
							if blocks != nil {
								if !title {
									fmt.Fprintf(os.Stderr, "Dominance frontier of %s:\n", fn)
									title = true
								}
								fmt.Fprintf(os.Stderr, "\t%s: %s\n", fn.Blocks[i], blocks)
							}
						}
					}
				}
				liftAlloc(closure, df, rdf, instr, newPhis, newSigmas)
				index = numAllocs
				numAllocs++
				instr.index = index
			case *Defer:
				usesDefer = true
			case *RunDefers:
				b.rundefers++
			}
		}
	}

	if numAllocs > 0 {
		// renaming maps an alloc (keyed by index) to its replacement
		// value. Initially the renaming contains nil, signifying the
		// zero constant of the appropriate type; we construct the
		// Const lazily at most once on each path through the domtree.
		// TODO(adonovan): opt: cache per-function not per subtree.
		renaming := make([]Value, numAllocs)

		// Renaming.
		rename(fn.Blocks[0], renaming, newPhis, newSigmas)

		simplifyPhis(newPhis)

		// Eliminate dead φ- and σ-nodes.
		markLiveNodes(fn.Blocks, newPhis, newSigmas)
	}

	// Prepend remaining live φ-nodes to each block and possibly kill rundefers.
	for _, b := range fn.Blocks {
		var head []Instruction
		if numAllocs > 0 {
			nps := newPhis[b.Index]
			head = make([]Instruction, 0, len(nps))
			for _, pred := range b.Preds {
				nss := newSigmas[pred.Index]
				idx := pred.succIndex(b)
				for _, newSigma := range nss {
					if sigma := newSigma.sigmas[idx]; sigma != nil && sigma.live {
						head = append(head, sigma)

						// we didn't populate referrers before, as most
						// sigma nodes will be killed
						if refs := sigma.X.Referrers(); refs != nil {
							*refs = append(*refs, sigma)
						}
					} else if sigma != nil {
						sigma.block = nil
					}
				}
			}
			for _, np := range nps {
				if np.phi.live {
					head = append(head, np.phi)
				} else {
					for _, edge := range np.phi.Edges {
						if refs := edge.Referrers(); refs != nil {
							*refs = removeInstr(*refs, np.phi)
						}
					}
					np.phi.block = nil
				}
			}
		}

		rundefersToKill := b.rundefers
		if usesDefer {
			rundefersToKill = 0
		}

		j := len(head)
		if j+b.gaps+rundefersToKill == 0 {
			continue // fast path: no new phis or gaps
		}

		// We could do straight copies instead of element-wise copies
		// when both b.gaps and rundefersToKill are zero. However,
		// that seems to only be the case ~1% of the time, which
		// doesn't seem worth the extra branch.

		// Remove dead instructions, add phis and sigmas
		ns := len(b.Instrs) + j - b.gaps - rundefersToKill
		if ns <= cap(b.Instrs) {
			// b.Instrs has enough capacity to store all instructions

			// OPT(dh): check cap vs the actually required space; if
			// there is a big enough difference, it may be worth
			// allocating a new slice, to avoid pinning memory.
			dst := b.Instrs[:cap(b.Instrs)]
			i := len(dst) - 1
			for n := len(b.Instrs) - 1; n >= 0; n-- {
				instr := dst[n]
				if instr == nil {
					continue
				}
				if !usesDefer {
					if _, ok := instr.(*RunDefers); ok {
						continue
					}
				}
				dst[i] = instr
				i--
			}
			off := i + 1 - len(head)
			// aid GC
			clearInstrs(dst[:off])
			dst = dst[off:]
			copy(dst, head)
			b.Instrs = dst
		} else {
			// not enough space, so allocate a new slice and copy
			// over.
			dst := make([]Instruction, ns)
			copy(dst, head)

			for _, instr := range b.Instrs {
				if instr == nil {
					continue
				}
				if !usesDefer {
					if _, ok := instr.(*RunDefers); ok {
						continue
					}
				}
				dst[j] = instr
				j++
			}
			b.Instrs = dst
		}
	}

	// Remove any fn.Locals that were lifted.
	j := 0
	for _, l := range fn.Locals {
		if l.index < 0 { // not lifted; keep it
			fn.Locals[j] = l
			j++
		}
	}
	// Nil out fn.Locals[j:] to aid GC.
	for i := j; i < len(fn.Locals); i++ {
		fn.Locals[i] = nil
	}
	fn.Locals = fn.Locals[:j]
}

func hasDirectReferrer(instr Instruction) bool {
	for _, instr := range *instr.Referrers() {
		switch instr.(type) {
		case *Phi, *Sigma:
			// ignore
		default:
			return true
		}
	}
	return false
}

func markLiveNodes(blocks []*BasicBlock, newPhis newPhiMap, newSigmas newSigmaMap) {
	// Phi and sigma nodes are considered live if a non-phi, non-sigma
	// node uses them. Once we find a node that is live, we mark all
	// of its operands as used, too.
	for _, npList := range newPhis {
		for _, np := range npList {
			phi := np.phi
			if !phi.live && hasDirectReferrer(phi) {
				markLivePhi(phi)
			}
		}
	}
	for _, npList := range newSigmas {
		for _, np := range npList {
			for _, sigma := range np.sigmas {
				if sigma != nil && !sigma.live && hasDirectReferrer(sigma) {
					markLiveSigma(sigma)
				}
			}
		}
	}
	// Existing φ-nodes due to && and || operators
	// are all considered live (see Go issue 19622).
	for _, b := range blocks {
		for _, phi := range b.phis() {
			markLivePhi(phi.(*Phi))
		}
	}
}

func markLivePhi(phi *Phi) {
	phi.live = true
	for _, rand := range phi.Edges {
		switch rand := rand.(type) {
		case *Phi:
			if !rand.live {
				markLivePhi(rand)
			}
		case *Sigma:
			if !rand.live {
				markLiveSigma(rand)
			}
		}
	}
}

func markLiveSigma(sigma *Sigma) {
	sigma.live = true
	switch rand := sigma.X.(type) {
	case *Phi:
		if !rand.live {
			markLivePhi(rand)
		}
	case *Sigma:
		if !rand.live {
			markLiveSigma(rand)
		}
	}
}

// simplifyPhis replaces trivial phis with non-phi alternatives. Phi
// nodes where all edges are identical, or consist of only the phi
// itself and one other value, may be replaced with the value.
func simplifyPhis(newPhis newPhiMap) {
	// find all phis that are trivial and can be replaced with a
	// non-phi value. run until we reach a fixpoint, because replacing
	// a phi may make other phis trivial.
	for changed := true; changed; {
		changed = false
		for _, npList := range newPhis {
			for _, np := range npList {
				if np.phi.live {
					// we're reusing 'live' to mean 'dead' in the context of simplifyPhis
					continue
				}
				if r, ok := isUselessPhi(np.phi); ok {
					// useless phi, replace its uses with the
					// replacement value. the dead phi pass will clean
					// up the phi afterwards.
					replaceAll(np.phi, r)
					np.phi.live = true
					changed = true
				}
			}
		}
	}

	for _, npList := range newPhis {
		for _, np := range npList {
			np.phi.live = false
		}
	}
}

type BlockSet struct {
	idx    int
	values []bool
	count  int
}

func NewBlockSet(size int) *BlockSet {
	return &BlockSet{values: make([]bool, size)}
}

func (s *BlockSet) Set(s2 *BlockSet) {
	copy(s.values, s2.values)
	s.count = 0
	for _, v := range s.values {
		if v {
			s.count++
		}
	}
}

func (s *BlockSet) Num() int {
	return s.count
}

func (s *BlockSet) Has(b *BasicBlock) bool {
	if b.Index >= len(s.values) {
		return false
	}
	return s.values[b.Index]
}
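// BlockSet is typically used as a worklist: blocks are Added as they become
// interesting and Taken until the set is empty. A minimal sketch of that
// pattern (visit is a placeholder, not a function in this package):
//
//	w := NewBlockSet(len(fn.Blocks))
//	w.Add(fn.Blocks[0])
//	for {
//		i := w.Take()
//		if i < 0 {
//			break
//		}
//		visit(fn.Blocks[i])
//	}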
// Add adds b to the set and reports whether the set changed.
func (s *BlockSet) Add(b *BasicBlock) bool {
	if s.values[b.Index] {
		return false
	}
	s.count++
	s.values[b.Index] = true
	s.idx = b.Index

	return true
}

func (s *BlockSet) Clear() {
	for j := range s.values {
		s.values[j] = false
	}
	s.count = 0
}

// Take removes an arbitrary element from the set s and
// returns its index, or -1 if the set is empty.
func (s *BlockSet) Take() int {
	// [i, end]
	for i := s.idx; i < len(s.values); i++ {
		if s.values[i] {
			s.values[i] = false
			s.idx = i
			s.count--
			return i
		}
	}

	// [start, i)
	for i := 0; i < s.idx; i++ {
		if s.values[i] {
			s.values[i] = false
			s.idx = i
			s.count--
			return i
		}
	}

	return -1
}

// closure records, for each basic block, the set of blocks reachable from
// it, encoded compactly as a list of intervals of block indices.
type closure struct {
	span       []uint32
	reachables []interval
}

// interval is one encoded word of an interval of reachable block indices.
// A small interval packs its length into the high bits and its start index
// into the low bits of a single word; a large interval sets the flag bit,
// stores its start here and its end in the following word.
type interval uint32

const (
	flagMask   = 1 << 31
	numBits    = 20
	lengthBits = 32 - numBits - 1
	lengthMask = (1<<lengthBits - 1) << numBits
	numMask    = 1<<numBits - 1
)

// has reports whether block v is reachable from block s, by decoding the
// interval list built by transitiveClosure.
func (c closure) has(s, v *BasicBlock) bool {
	idx := uint32(v.Index)
	r := c.reachable(s.Index)
	for i := 0; i < len(r); i++ {
		inv := r[i]
		var start, end uint32
		if inv&flagMask == 0 {
			// small interval
			start = uint32(inv & numMask)
			end = start + uint32(inv&lengthMask)>>numBits
		} else {
			// large interval
			i++
			start = uint32(inv & numMask)
			end = uint32(r[i])
		}
		if idx >= start && idx <= end {
			return true
		}
	}
	return false
}

func (c closure) reachable(id int) []interval {
	return c.reachables[c.span[id]:c.span[id+1]]
}

func (c closure) walk(current *BasicBlock, b *BasicBlock, visited []bool) {
	visited[b.Index] = true
	for _, succ := range b.Succs {
		if visited[succ.Index] {
			continue
		}
		visited[succ.Index] = true
		c.walk(current, succ, visited)
	}
}

func transitiveClosure(fn *Function) *closure {
	reachable := make([]bool, len(fn.Blocks))
	c := &closure{}
	c.span = make([]uint32, len(fn.Blocks)+1)

	addInterval := func(start, end uint32) {
		if l := end - start; l <= 1<<lengthBits-1 {
			// Small interval: length in the high bits, start in the low bits.
			c.reachables = append(c.reachables, interval(l<<numBits|start))
		} else {
			// Large interval: flag bit and start, followed by the end in a
			// second word.
			c.reachables = append(c.reachables, interval(flagMask|start), interval(end))
		}
	}

	for i, b := range fn.Blocks {
		for j := range reachable {
			reachable[j] = false
		}
		c.walk(b, b, reachable)

		// Encode the set of reachable block indices as intervals.
		start := -1
		for idx, ok := range reachable {
			if ok && start == -1 {
				start = idx
			} else if !ok && start != -1 {
				addInterval(uint32(start), uint32(idx-1))
				start = -1
			}
		}
		if start != -1 {
			addInterval(uint32(start), uint32(len(reachable)-1))
		}

		c.span[i+1] = uint32(len(c.reachables))
	}
	return c
}

// newPhi is a pair of a newly introduced φ-node and the lifted Alloc
// it replaces.
type newPhi struct {
	phi   *Phi
	alloc *Alloc
}

// newSigma is a lifted Alloc together with the σ-nodes, one per CFG
// successor edge, that carry its value into the successor blocks.
type newSigma struct {
	alloc  *Alloc
	sigmas []*Sigma // indexed by successor index
}

// newPhiMap records for each basic block, the set of newPhis that
// must be prepended to the block.
type newPhiMap [][]newPhi

// newSigmaMap records for each basic block, the set of newSigmas whose
// σ-nodes must be prepended to that block's successors.
type newSigmaMap [][]newSigma

// replaceAll replaces all intraprocedural uses of x with y,
// updating x.Referrers and y.Referrers.
// Precondition: x.Referrers() != nil, i.e. x must be local to some function.
func replaceAll(x, y Value) {
	var rands []*Value
	pxrefs := x.Referrers()
	pyrefs := y.Referrers()
	for _, instr := range *pxrefs {
		rands = instr.Operands(rands[:0]) // recycle storage
		for _, rand := range rands {
			if *rand != nil {
				if *rand == x {
					*rand = y
				}
			}
		}
		if pyrefs != nil {
			*pyrefs = append(*pyrefs, instr) // dups ok
		}
	}
	*pxrefs = nil // x is now unreferenced
}

// renamed returns the value to which alloc is being renamed,
// constructing it lazily if it's the implicit zero initialization.
func renamed(fn *Function, renaming []Value, alloc *Alloc) Value {
	v := renaming[alloc.index]
	if v == nil {
		v = emitConst(fn, zeroConst(deref(alloc.Type())))
		renaming[alloc.index] = v
	}
	return v
}

// rename implements the Cytron et al.-based renaming algorithm, a
// preorder traversal of the dominator tree replacing all loads of
// Alloc cells with the value stored to that cell by the dominating
// store instruction, and updating the new φ- and σ-nodes as it goes.
//
// renaming is a map from *Alloc (keyed by index number) to its
// dominating stored value; newPhis[x] is the set of new φ-nodes to be
// prepended to block x; newSigmas[x] is the set of new σ-nodes for the
// outgoing edges of block x.
func rename(u *BasicBlock, renaming []Value, newPhis newPhiMap, newSigmas newSigmaMap) {
	// Rename loads and stores of lifted Allocs in this block, and delete
	// the instructions that become redundant.
	for i, instr := range u.Instrs {
		switch instr := instr.(type) {
		case *Alloc:
			if instr.index >= 0 { // store of zero to Alloc cell
				// Replace dominated loads by the zero value.
				renaming[instr.index] = nil
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tkill alloc %s\n", instr)
				}
				// Delete the Alloc.
				u.Instrs[i] = nil
				u.gaps++
			}

		case *Store:
			if alloc, ok := instr.Addr.(*Alloc); ok && alloc.index >= 0 { // store to Alloc cell
				// Replace dominated loads by the stored value.
				renaming[alloc.index] = instr.Val
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tkill store %s; new value: %s\n",
						instr, instr.Val.Name())
				}
				if refs := instr.Addr.Referrers(); refs != nil {
					*refs = removeInstr(*refs, instr)
				}
				if refs := instr.Val.Referrers(); refs != nil {
					*refs = removeInstr(*refs, instr)
				}
				// Delete the Store.
				u.Instrs[i] = nil
				u.gaps++
			}

		case *Load:
			if alloc, ok := instr.X.(*Alloc); ok && alloc.index >= 0 { // load of Alloc cell
				// In theory, we wouldn't be able to replace loads
				// directly, because a loaded value could be used in
				// different branches, in which case it should be
				// replaced with different sigma nodes. But we can't
				// simply defer replacement, either, because then
				// later stores might incorrectly affect this load.
				//
				// To avoid doing renaming on _all_ values (instead of
				// just loads and stores like we're doing), we make
				// sure during code generation that each load is only
				// used in one block. For example, in constant switch
				// statements, where the tag is only evaluated once,
				// we store it in a temporary and load it for each
				// comparison, so that we have individual loads to
				// replace.
				newval := renamed(u.Parent(), renaming, alloc)
				if debugLifting {
					fmt.Fprintf(os.Stderr, "\tupdate load %s = %s with %s\n",
						instr.Name(), instr, newval)
				}
				replaceAll(instr, newval)
				u.Instrs[i] = nil
				u.gaps++
			}

		case *DebugRef:
			if x, ok := instr.X.(*Alloc); ok && x.index >= 0 {
				if instr.IsAddr {
					instr.X = renamed(u.Parent(), renaming, x)
					instr.IsAddr = false

					// Add DebugRef to instr.X's referrers.
					if refs := instr.X.Referrers(); refs != nil {
						*refs = append(*refs, instr)
					}
				} else {
					// A source expression denotes the address
					// of an Alloc that was optimized away.
					instr.X = nil

					// Delete the DebugRef.
					u.Instrs[i] = nil
					u.gaps++
				}
			}
		}
	}

	// update all outgoing sigma nodes with the dominating store
	for _, sigmas := range newSigmas[u.Index] {
		for _, sigma := range sigmas.sigmas {
			if sigma == nil {
				continue
			}
			sigma.X = renamed(u.Parent(), renaming, sigmas.alloc)
		}
	}

	// For each φ-node in a CFG successor, rename the edge.
	for succi, v := range u.Succs {
		phis := newPhis[v.Index]
		if len(phis) == 0 {
			continue
		}
		i := v.predIndex(u)
		for _, np := range phis {
			phi := np.phi
			alloc := np.alloc
			// if there's a sigma node, use it, else use the dominating value
			var newval Value
			for _, sigmas := range newSigmas[u.Index] {
				if sigmas.alloc == alloc && sigmas.sigmas[succi] != nil {
					newval = sigmas.sigmas[succi]
					break
				}
			}
			if newval == nil {
				newval = renamed(u.Parent(), renaming, alloc)
			}
			if debugLifting {
				fmt.Fprintf(os.Stderr, "\tsetphi %s edge %s -> %s (#%d) (alloc=%s) := %s\n",
					phi.Name(), u, v, i, alloc.Name(), newval.Name())
			}
			phi.Edges[i] = newval
			if prefs := newval.Referrers(); prefs != nil {
				*prefs = append(*prefs, phi)
			}
		}
	}

	// Continue depth-first recursion over domtree, pushing a
	// fresh copy of the renaming map for each subtree.
	r := make([]Value, len(renaming))
	for _, v := range u.dom.children {
		// XXX add debugging
		copy(r, renaming)

		// on entry to a block, the incoming sigma nodes become the new
		// values for their alloc
		if idx := u.succIndex(v); idx != -1 {
			for _, sigma := range newSigmas[u.Index] {
				if sigma.sigmas[idx] != nil {
					r[sigma.alloc.index] = sigma.sigmas[idx]
				}
			}
		}
		rename(v, r, newPhis, newSigmas)
	}
}
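// To give an informal, schematic example of the overall transformation (the
// instruction names below are invented for illustration and are not real IR
// output), a function body such as
//
//	t0 = local int (x)
//	store 0 to t0
//	if cond goto b1 else b2
//	b1: store 1 to t0; jump b3
//	b2: jump b3
//	b3: t1 = load t0; use(t1)
//
// is rewritten by lift into a body with no Alloc, Store or Load for x:
//
//	if cond goto b1 else b2
//	b1: jump b3
//	b2: jump b3
//	b3: t2 = φ(b1: 1, b2: 0); use(t2)
//
// and, where a lifted value is also used under a branch, σ-nodes are
// prepended to the successor blocks so that each arm sees its own renamed
// copy, which is what makes the result SSI rather than plain SSA form.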