diff --git a/arch/blackfin/cpu/start.S b/arch/blackfin/cpu/start.S
index 7a3abba21c4bbf7ebf562e247a1bafbe68dd9186..15ecb1e8ebcd4fe7a017037f00ce96b7bbd8352a 100644
--- a/arch/blackfin/cpu/start.S
+++ b/arch/blackfin/cpu/start.S
@@ -52,6 +52,19 @@ ENTRY(_start)
 	sp.l = LO(L1_SRAM_SCRATCH_END - 20);
 	sp.h = HI(L1_SRAM_SCRATCH_END - 20);
 
+	/* Optimization register tricks: keep a base value in the
+	 * reserved P registers so we can use the load/store with
+	 * offset syntax, e.g. R0 = [P5 + <constant>];
+	 *   P4 - system MMR base (only set with CONFIG_HW_WATCHDOG)
+	 *   P5 - core MMR base
+	 */
+#ifdef CONFIG_HW_WATCHDOG
+	p4.l = 0;
+	p4.h = HI(SYSMMR_BASE);
+#endif
+	p5.l = 0;
+	p5.h = HI(COREMMR_BASE);
+
 #ifdef CONFIG_HW_WATCHDOG
 # ifndef CONFIG_HW_WATCHDOG_TIMEOUT_START
 #  define CONFIG_HW_WATCHDOG_TIMEOUT_START 5000
@@ -60,13 +73,11 @@ ENTRY(_start)
 	 * That should be long enough to bootstrap ourselves up and
 	 * then the common u-boot code can take over.
 	 */
-	P0.L = LO(WDOG_CNT);
-	P0.H = HI(WDOG_CNT);
-	R0.L = 0;
-	R0.H = HI(MSEC_TO_SCLK(CONFIG_HW_WATCHDOG_TIMEOUT_START));
-	[P0] = R0;
+	r0 = 0;
+	r0.h = HI(MSEC_TO_SCLK(CONFIG_HW_WATCHDOG_TIMEOUT_START));
+	[p4 + (WDOG_CNT - SYSMMR_BASE)] = r0;
 	/* fire up the watchdog - R0.L above needs to be 0x0000 */
-	W[P0 + (WDOG_CTL - WDOG_CNT)] = R0;
+	W[p4 + (WDOG_CTL - SYSMMR_BASE)] = r0;
 #endif
 
 	/* Turn on the serial for debugging the init process */
@@ -121,6 +132,18 @@ ENTRY(_start)
 	if cc jump .Lnorelocate;
 	r6 = 0 (x);
 
+	/* Turn off caches as they require CPLBs and a CPLB miss requires
+	 * a software exception handler to process it.  But we're about to
+	 * clobber any previously executing software (like U-Boot that just
+	 * launched a new U-Boot via 'go'), so any handler state will be
+	 * unreliable after the memcpy below.
+	 */
+	serial_early_puts("Kill Caches");
+	r0 = 0;
+	[p5 + (IMEM_CONTROL - COREMMR_BASE)] = r0;
+	[p5 + (DMEM_CONTROL - COREMMR_BASE)] = r0;
+	ssync;
+
 	/* In bypass mode, we don't have an LDR with an init block
 	 * so we need to explicitly call it ourselves.  This will
 	 * reprogram our clocks, memory, and setup our async banks.
@@ -204,17 +227,15 @@ ENTRY(_start)
 	serial_early_puts("Lower to 15");
 	r0 = r7;
 	r1 = r6;
-	p0.l = LO(EVT15);
-	p0.h = HI(EVT15);
 	p1.l = .Lenable_nested;
 	p1.h = .Lenable_nested;
-	[p0] = p1;
+	[p5 + (EVT15 - COREMMR_BASE)] = p1;
 	r7 = EVT_IVG15 (z);
 	sti r7;
 	raise 15;
-	p4.l = .LWAIT_HERE;
-	p4.h = .LWAIT_HERE;
-	reti = p4;
+	p3.l = .LWAIT_HERE;
+	p3.h = .LWAIT_HERE;
+	reti = p3;
 	rti;
 
 	/* Enable nested interrupts before continuing with cpu init */