diff --git a/board/zylonite/lowlevel_init.S b/board/zylonite/lowlevel_init.S
index 54c2adfaf3b50083814f9279497d1593d78b8b5e..4d62be5d5e2a9b306e88a8f71fcbe11e952fbe3d 100644
--- a/board/zylonite/lowlevel_init.S
+++ b/board/zylonite/lowlevel_init.S
@@ -39,6 +39,16 @@ DRAM_SIZE:  .long   CFG_DRAM_SIZE
+.macro wait time	/* busy-wait on the OS timer; uses r2 and r3 as scratch and changes the flags */
+	ldr             r2, =OSCR	/* r2 = address of the OS timer count register */
+	mov             r3, #0
+	str             r3, [r2]	/* restart the count from zero */
+	ldr             r3, [r2]	/* poll the current count */
+	cmp             r3, \time
+	bls             0b		/* unsigned: keep polling while count <= time (local label 0 is not visible in this hunk) */
  * 	Memory setup
@@ -48,7 +58,7 @@ lowlevel_init:
 	/* Set up GPIO pins first ----------------------------------------- */
 	mov      r10, lr
-        /*  GPIO41, 42, 43, 44, 45, 46, 47, 48 */
+        /*  Configure GPIO pins 41 - 48 as UART1 (alternate function 2) */
 	ldr             r0, =0x40E10438 @ GPIO41 FFRXD
 	ldr             r1, =0x802
 	str             r1, [r0]
@@ -82,7 +92,7 @@ lowlevel_init:
 	str             r1, [r0]
         /* tebrandt - ASCR, clear the RDH bit */
-	ldr             r0, =ASCR
+	ldr             r0, =ASCR	
 	ldr             r1, [r0]
 	bic             r1, r1, #0x80000000
 	str             r1, [r0]
@@ -101,16 +111,18 @@ lowlevel_init:
 	/*         FIXME: can be optimized later                            */
 	/* ---------------------------------------------------------------- */
-	ldr r3, =OSCR			/* reset the OS Timer Count to zero */
-	mov r2, #0
-	str r2, [r3]
-	ldr r4, =0x300			/* really 0x2E1 is about 200usec,   */
-					/* so 0x300 should be plenty        */
-	ldr r2, [r3]
-	cmp r4, r2
-	bgt 1b
+	/* mk: replaced with wait macro; 0x2E1 ticks is about 200usec, so wait 0x300 (hex) ticks */
+/* 	ldr r3, =OSCR			/\* reset the OS Timer Count to zero *\/ */
+/* 	mov r2, #0 */
+/* 	str r2, [r3] */
+/* 	ldr r4, =0x300			/\* really 0x2E1 is about 200usec,   *\/ */
+/* 					/\* so 0x300 should be plenty        *\/ */
+/* 1: */
+/* 	ldr r2, [r3] */
+/* 	cmp r4, r2 */
+/* 	bgt 1b */
+	wait #0x300
 	/* configure the MEMCLKCFG register */
@@ -209,8 +221,12 @@ mem_init:
 	str		r2, [r1]
 	ldr		r2, [r1]
-	/* DDR Read-Strobe Delay Calibration */
-	/* bl	ddr_calibration */
+	/* Hardware DDR Read-Strobe Delay Calibration: program DDR_HCAL, wait briefly, read it back */
+	ldr             r0, =DDR_HCAL           @ r0 = address of the DDR_HCAL register
+	ldr             r1, =0x803ffc07     @ NOTE(review): original author (-SC) was unsure this value is correct - confirm against the DDR_HCAL description
+	str             r1, [r0]
+	wait		#5			@ short delay on the OS timer (clobbers r2, r3)
+	ldr             r1, [r0]		@ read DDR_HCAL back; the value is not examined in the lines shown here
 	/* Here we assume the hardware calibration alwasy be successful. -SC */
 	/* Set DMCEN bit in MDCNFG Register */
@@ -220,10 +236,12 @@ mem_init:
 	str		r1, [r0]
 	/* scrub/init SDRAM if enabled/present */
-	ldr	r11, =0xa0000000 //RAM_BASE	// base address of SDRAM
-	ldr	r12, =0x04000000 // size of memory to scrub
-	mov	r8,r12		// save DRAM size
-	mov	r0, #0		// scrub with 0x0000:0000
+/* 	ldr	r11, =0xa0000000 /\* base address of SDRAM (CFG_DRAM_BASE) *\/ */
+/* 	ldr	r12, =0x04000000 /\* size of memory to scrub (CFG_DRAM_SIZE) *\/ */
+/* 	mov	r8,r12		 /\* save DRAM size (mk: why???) *\/ */
+	ldr	r8, =0xa0000000  /* base address of SDRAM (CFG_DRAM_BASE) */
+	ldr	r9, =0x04000000  /* size of memory to scrub (CFG_DRAM_SIZE) */
+	mov	r0, #0		 /* scrub with 0x0000:0000 */
 	mov	r1, #0
 	mov	r2, #0				
 	mov	r3, #0
@@ -232,8 +250,8 @@ mem_init:
 	mov	r6, #0					
 	mov	r7, #0
 10:     /* fastScrubLoop */
-	subs	r12, r12, #32	// 32 bytes/line
-	stmia	r11!, {r0-r7}
+	subs	r9, r9, #32	/* 32 bytes/line; sets Z when the remaining size reaches zero */
+	stmia	r8!, {r0-r7}	/* store r0-r7 (eight words) and advance r8; leaves the flags from subs intact */
 	beq	15f
 	b	10b
@@ -264,3 +282,94 @@ mem_init:
     mov     pc, lr
+@ DDR calibration
@ This function calibrates the DQS delay lines.
@ Monahans supports three ways to do it: software
@ calibration, hardware calibration, and hybrid
@ calibration.
+@ TBD
+@ -SC
+	@ Case 1:	Write the correct delay value once
+        @ Configure DDR_SCAL Register
+	ldr             r0, =DDR_SCAL           @ DDR_SCAL
+q	ldr             r1, =0xaf2f2f2f
+	str             r1, [r0]
+	ldr             r1, [r0]
+/*	@ Case 2:	Software Calibration
+	@ Write test pattern to memory
+	ldr		r5, =0x0faf0faf         @ Data Pattern
+	ldr		r4, =0xa0000000		@ DDR ram
+	str		r5, [r4]
+	mov		r1, =0x0		@ delay count
+	mov		r6, =0x0
+	mov		r7, =0x0
+	add		r1, r1, =0x1
+	cmp		r1, =0xf
+	ble		end_loop
+	mov		r3, r1
+	mov             r0, r1, lsl #30
+	orr		r3, r3, r0
+	mov             r0, r1, lsl #22
+	orr		r3, r3, r0
+	mov             r0, r1, lsl #14
+	orr		r3, r3, r0
+	orr		r3, r3, =0x80000000
+	ldr		r2, =DDR_SCAL
+	str		r3, [r2]
+	ldr		r2, [r4]
+	cmp		r2, r5
+	bne		ddr_loop1
+	mov		r6, r1
+	add		r1, r1, =0x1
+	cmp		r1, =0xf
+	ble		end_loop
+        mov             r3, r1
+        mov             r0, r1, lsl #30
+        orr             r3, r3, r0
+        mov             r0, r1, lsl #22
+        orr             r3, r3, r0
+        mov             r0, r1, lsl #14
+        orr             r3, r3, r0
+        orr             r3, r3, =0x80000000
+        ldr             r2, =DDR_SCAL
+        str             r3, [r2]
+	ldr		r2, [r4]
+	cmp		r2, r5
+	be		ddr_loop2
+	mov		r7, r2
+	add		r3, r6, r7
+	lsr		r3, r3, =0x1
+        mov             r0, r1, lsl #30
+        orr             r3, r3, r0
+        mov             r0, r1, lsl #22
+        orr             r3, r3, r0
+        mov             r0, r1, lsl #14
+        orr             r3, r3, r0
+        orr             r3, r3, =0x80000000
+        ldr             r2, =DDR_SCAL
	@ Case 3:	Hardware Calibration
+	ldr             r0, =DDR_HCAL           @ DDR_HCAL
+	ldr             r1, =0x803ffc07     @ the offset is correct? -SC
+	str             r1, [r0]
+	wait		#5
+	ldr             r1, [r0]
+	mov		pc, lr	
diff --git a/include/asm-arm/arch-pxa/hardware.h b/include/asm-arm/arch-pxa/hardware.h
index a12aea14c4c3f48786d429115bc9395f279be395..c8c479a186923d78f057d9a0d8a6089009a1b4e8 100644
--- a/include/asm-arm/arch-pxa/hardware.h
+++ b/include/asm-arm/arch-pxa/hardware.h
@@ -8,6 +8,11 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
+ *
+ * Note: This file was taken from linux-2.4.19-rmk4-pxa1
+ *
+ * - 2003/01/20 implementation specifics activated
+ *   Robert Schwebel <r.schwebel@pengutronix.de>
@@ -16,6 +21,16 @@
 #include <linux/config.h>
 #include <asm/mach-types.h>
+ * These are statically mapped PCMCIA IO space for designs using it as a
+ * generic IO bus, typically with ISA parts, hardwired IDE interfaces, etc.
+ * The actual PCMCIA code is mapping required IO region at run time.
+ */
+#define PCMCIA_IO_0_BASE	0xf6000000
+#define PCMCIA_IO_1_BASE	0xf7000000
  * We requires absolute addresses.
@@ -29,63 +44,22 @@
- * Intel PXA2xx internal register mapping:
+ * Intel PXA internal I/O mappings:
- * 0x40000000 - 0x41ffffff <--> 0xf2000000 - 0xf3ffffff
- * 0x44000000 - 0x45ffffff <--> 0xf4000000 - 0xf5ffffff
- * 0x48000000 - 0x49ffffff <--> 0xf6000000 - 0xf7ffffff
- * 0x4c000000 - 0x4dffffff <--> 0xf8000000 - 0xf9ffffff
- * 0x50000000 - 0x51ffffff <--> 0xfa000000 - 0xfbffffff
- * 0x54000000 - 0x55ffffff <--> 0xfc000000 - 0xfdffffff
- * 0x58000000 - 0x59ffffff <--> 0xfe000000 - 0xffffffff
- *
- * Note that not all PXA2xx chips implement all those addresses, and the
- * kernel only maps the minimum needed range of this mapping.
- */
-#define io_p2v(x) (0xf2000000 + ((x) & 0x01ffffff) + (((x) & 0x1c000000) >> 1))
-#define io_v2p(x) (0x3c000000 + ((x) & 0x01ffffff) + (((x) & 0x0e000000) << 1))
-/* There are too many IO area needed to map, so I divide them into 3 areas
- * 0x40000000 - 0x41ffffff <--> 0xf6000000 - 0xf7ffffff  Devs
+ * 0x40000000 - 0x41ffffff <--> 0xf8000000 - 0xf9ffffff
+ * 0x44000000 - 0x45ffffff <--> 0xfa000000 - 0xfbffffff
+ * 0x48000000 - 0x49ffffff <--> 0xfc000000 - 0xfdffffff
-#define io_p2v(x)  ((((x) & 0xfc000000)>>4) + 0xf2000000 + ((x)&0x01ffffff))
-#define io_v2p(x)  (((((x) - 0xf2000000)&0xfc000000)<<4) + ((x)&0x01ffffff))
- * 0x42000000 - 0x421fffff <--> 0xf8000000 - 0xf81fffff  MMC2 & USIM2
- * 0x43000000 - 0x430fffff <--> 0xf8200000 - 0xf82fffff  Caddo
- * 0x43100000 - 0x431fffff <--> 0xf8300000 - 0xf83fffff  NAND
- * 0x44000000 - 0x440fffff <--> 0xf8400000 - 0xf84fffff  LCD
- * 0x46000000 - 0x460fffff <--> 0xf8800000 - 0xf88fffff  Mini LCD
- * 0x48100000 - 0x481fffff <--> 0xf8d00000 - 0xf8dfffff  Dynamic Mem Ctl
- * 0x4a000000 - 0x4a0fffff <--> 0xf9000000 - 0xf90fffff  Static Mem Ctl
- * 0x4c000000 - 0x4c0fffff <--> 0xf9400000 - 0xf94fffff  USB Host
- */
-#define io_p2v_2(x)	(((((x) - 0x42000000) & 0xff000000) >> 3) + 0xf8000000\
- 			+ ((x) & 0x001fffff))
-#define io_v2p_2(x)	(((((x) & 0xffe00000) - 0xf8000000) << 3) + 0x42000000\
-				+ (x & 0x001fffff)) 
- * 0x50000000 - 0x500fffff <--> 0xfa000000 - 0xfa0fffff  Camera Interface
- * 0x54000000 - 0x540fffff <--> 0xfa400000 - 0xfa4fffff  2D Graphics Ctrl
- * 0x54100000 - 0x541fffff <--> 0xfa500000 - 0xfa5fffff  USB Device 2.0 Ctrl
- * 0x58000000 - 0x580fffff <--> 0xfa800000 - 0xfa8fffff  Internal SRAM Ctrl
- */
-#define io_p2v_3(x)	((((x) & 0xfc000000) >> 4) + 0xf5000000 + \
-				((x) & 0x001fffff)) 
-#define io_v2p_3(x)	(((((x) - 0xf5000000) & 0x0fc00000) << 4) + \
-				((x) & 0x001fffff)) 
+/* FIXME: Only this works for U-Boot; find out why. [RS] */
+#define UBOOT_REG_FIX 1
+#ifndef UBOOT_REG_FIX
 #ifndef __ASSEMBLY__
-#if 0
-# define __REG(x)	(*((volatile u32 *)io_p2v(x)))
+#define io_p2v(x)	( ((x) | 0xbe000000) ^ (~((x) >> 1) & 0x06000000) )	/* physical -> virtual, e.g. 0x40000000 -> 0xf8000000, 0x44000000 -> 0xfa000000 */
+#define io_v2p( x )	( ((x) & 0x41ffffff) ^ ( ((x) & 0x06000000) << 1) )	/* virtual -> physical, e.g. 0xf8000000 -> 0x40000000, 0xfa000000 -> 0x44000000 */
  * This __REG() version gives the same results as the one above,  except
  * that we are fooling gcc somehow so it generates far better and smaller
@@ -96,56 +70,66 @@
 typedef struct { volatile u32 offset[4096]; } __regbase;
 # define __REGP(x)	((__regbase *)((x)&~4095))->offset[((x)&4095)>>2]
 # define __REG(x)	__REGP(io_p2v(x))
-/* __REG_2 is for NAND, LCD etc.
- * __REG_3 is for Camera Interface, 2D Graphics, U2D etc.*/
-#define __REG_2(x)	__REGP(io_p2v_2(x))
-#define __REG_3(x)	__REGP(io_p2v_3(x))
-#endif /* if 0 */
-/* With indexed regs we don't want to feed the index through io_p2v()
-   especially if it is a variable, otherwise horrible code will result. */
-# define __REG2(x,y)     (*(volatile u32 *)((u32)&__REG(x) + (y)))
+/* Let's kick gcc's ass again... */
+# define __REG2(x,y)	\
+	( __builtin_constant_p(y) ? (__REG((x) + (y))) \
+				  : (*(volatile u32 *)((u32)&__REG(x) + (y))) )
 # define __PREG(x)	(io_v2p((u32)&(x)))
-#else /* ifndef __ASSEMBLY__ */
 # define __REG(x)	io_p2v(x)
 # define __PREG(x)	io_v2p(x)
-# define __REG_2(x)	io_p2v(x)
-# define __REG_3(x)	io_p2v(x)
+# undef io_p2v
+# undef __REG
+# ifndef __ASSEMBLY__	/* C code: registers are volatile u32 lvalues */
+#  define io_p2v(PhAdd)	   (PhAdd)	/* identity: the address is used unchanged */
+#  define __REG(x)	(*((volatile u32 *)io_p2v(x)))	/* the 32-bit register at address x */
+#  define __REG2(x,y)	(*(volatile u32 *)((u32)&__REG(x) + (y)))	/* the 32-bit register y bytes past address x */
+# else	/* assembler: register names expand to plain addresses */
+#  define __REG(x) (x)
+#  ifdef CONFIG_CPU_MONAHANS /* Hack to make this work with mona's pxa-regs.h */
+#   define __REG_2(x) (x)
+#   define __REG_3(x) (x)
+#  endif	/* CONFIG_CPU_MONAHANS */
+# endif	/* __ASSEMBLY__ */
+#endif /* UBOOT_REG_FIX */
-#endif /* ifndef __ASSEMBLY__ */
+#include "pxa-regs.h"
 #ifndef __ASSEMBLY__
-#include "zylonite.h"
+ * GPIO edge detection for IRQs:
+ * IRQs are generated on Falling-Edge, Rising-Edge, or both.
+ * This must be called *before* the corresponding IRQ is registered.
+ * Use this instead of directly setting GRER/GFER.
+ */
+#define GPIO_BOTH_EDGES		3
+extern void set_GPIO_IRQ_edge( int gpio_nr, int edge_mask );
  * Handy routine to set GPIO alternate functions
-extern void pxa_gpio_mode( int gpio_mode );
+extern void set_GPIO_mode( int gpio_mode );
- * Routine to enable or disable CKEN
+ * return current lclk frequency in units of 10kHz
-extern void pxa_set_cken(int clock, int enable);
+extern unsigned int get_lclk_frequency_10khz(void);
- * return current memory and LCD clock frequency in units of 10kHz
+ * Implementation specifics
-extern unsigned int get_memclk_frequency_10khz(void);
-extern unsigned int get_lcdclk_frequency_10khz(void);
-#endif /* __ASSEMBLY__ */
 #include "lubbock.h"
@@ -159,15 +143,6 @@ extern unsigned int get_lcdclk_frequency_10khz(void);
 #include "cerf.h"
-#define	__cpuc_flush_l2cache_all	xscale_flush_l2cache_all
-extern void __cpuc_flush_l2cache_all(void);
-#define	flush_l2cache_all		__cpuc_flush_l2cache_all
-#define	__cpuc_flush_l2cache_all()	do {} while (0)
-#define	flush_l2cache_all()		do {} while (0)
 #ifdef CONFIG_ARCH_CSB226
 #include "csb226.h"
@@ -180,10 +155,4 @@ extern void __cpuc_flush_l2cache_all(void);
 #include "pleb.h"
-#include "mainstone.h"
-#include "pxa-regs.h"
-#endif  /* _ASM_ARCH_HARDWARE_H */
+#endif	/* _ASM_ARCH_HARDWARE_H */
diff --git a/include/asm-arm/global_data.h b/include/asm-arm/global_data.h
index c2d52915a85fc4816dcff69c47e240a98462de8a..b3c7084352eae61beb8cabd4a79952d11176aaa9 100644
--- a/include/asm-arm/global_data.h
+++ b/include/asm-arm/global_data.h
@@ -61,6 +61,11 @@ typedef	struct	global_data {
 #define	GD_FLG_DEVINIT	0x00002		/* Devices have been initialized	*/
 #define	GD_FLG_SILENT	0x00004		/* Silent mode				*/
-#define DECLARE_GLOBAL_DATA_PTR     register volatile gd_t *gd asm ("r8")
+#define GCC_4_SCREW_GDP 1
+#ifdef GCC_4_SCREW_GDP
+# define DECLARE_GLOBAL_DATA_PTR     register gd_t* volatile gd asm ("r8")	/* no trailing ';' here: the user of the macro supplies it */
+# define DECLARE_GLOBAL_DATA_PTR     register volatile gd_t *gd asm ("r8")
 #endif /* __ASM_GBL_DATA_H */