Fix userland ELF loader for zero sized BSS.
[qemu] / target-i386 / translate.c
index 3ab09a3..ad18af9 100644 (file)
@@ -24,7 +24,6 @@
 #include <inttypes.h>
 #include <signal.h>
 #include <assert.h>
-#include <sys/mman.h>
 
 #include "cpu.h"
 #include "exec-all.h"
@@ -40,18 +39,51 @@ static uint32_t *gen_opparam_ptr;
 #define PREFIX_DATA   0x08
 #define PREFIX_ADR    0x10
 
+#ifdef TARGET_X86_64
+#define X86_64_ONLY(x) x /* x86_64 build: the argument is kept as-is */
+#define X86_64_DEF(x...) x /* x86_64 build: the argument list is emitted unchanged */
+#define CODE64(s) ((s)->code64) /* reads the code64 field of s */
+#define REX_X(s) ((s)->rex_x) /* reads the rex_x field of s */
+#define REX_B(s) ((s)->rex_b) /* reads the rex_b field of s */
+/* XXX: gcc generates push/pop in some opcodes, so we cannot use them */
+#if 1
+#define BUGGY_64(x) NULL /* always NULL: the #if above is hard-wired to 1 */
+#endif
+#else /* !TARGET_X86_64 */
+#define X86_64_ONLY(x) NULL /* the argument is replaced by NULL */
+#define X86_64_DEF(x...) /* the argument list is dropped entirely */
+#define CODE64(s) 0 /* CODE64, REX_X and REX_B are constant 0 in this branch */
+#define REX_X(s) 0
+#define REX_B(s) 0
+#endif /* TARGET_X86_64 */
+
+#ifdef TARGET_X86_64
+static int x86_64_hregs; /* tested by the DEF_BREGS wrappers: non-zero selects the prefixb op, zero the prefixh op */
+#endif /* TARGET_X86_64 */
+
+#ifdef USE_DIRECT_JUMP
+#define TBPARAM(x) /* expands to nothing when USE_DIRECT_JUMP is defined */
+#else
+#define TBPARAM(x) (long)(x) /* otherwise yields x cast to long */
+#endif /* USE_DIRECT_JUMP */
+
 typedef struct DisasContext {
     /* current insn context */
     int override; /* -1 if no override */
     int prefix;
     int aflag, dflag;
-    uint8_t *pc; /* pc = eip + cs_base */
+    target_ulong pc; /* pc = eip + cs_base */
     int is_jmp; /* 1 = means jump (stop translation), 2 means CPU
                    static state change (stop translation) */
     /* current block context */
-    uint8_t *cs_base; /* base of CS segment */
+    target_ulong cs_base; /* base of CS segment */
     int pe;     /* protected mode */
     int code32; /* 32 bit code segment */
+#ifdef TARGET_X86_64
+    int lma;    /* long mode active */
+    int code64; /* 64 bit code segment */
+    int rex_x, rex_b;
+#endif
     int ss32;   /* 32 bit stack segment */
     int cc_op;  /* current CC operation */
     int addseg; /* non zero if either DS/ES/SS have a non zero base */
@@ -63,12 +95,17 @@ typedef struct DisasContext {
     int singlestep_enabled; /* "hardware" single step enabled */
     int jmp_opt; /* use direct block chaining for direct jumps */
     int mem_index; /* select memory access functions */
+    int flags; /* all execution flags */
     struct TranslationBlock *tb;
     int popl_esp_hack; /* for correct popl with esp base handling */
+    int rip_offset; /* only used in x86_64, but left for simplicity */
+    int cpuid_features;
+    int cpuid_ext_features;
 } DisasContext;
 
 static void gen_eob(DisasContext *s);
-static void gen_jmp(DisasContext *s, unsigned int eip);
+static void gen_jmp(DisasContext *s, target_ulong eip);
+static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
 
 /* i386 arith/logic operations */
 enum {
@@ -121,108 +158,182 @@ enum {
     OR_EBP,
     OR_ESI,
     OR_EDI,
-    OR_TMP0,    /* temporary operand register */
+
+    OR_TMP0 = 16,    /* temporary operand register */
     OR_TMP1,
     OR_A0, /* temporary register used when doing address evaluation */
-    OR_ZERO, /* fixed zero register */
-    NB_OREGS,
 };
 
-typedef void (GenOpFunc)(void);
-typedef void (GenOpFunc1)(long);
-typedef void (GenOpFunc2)(long, long);
-typedef void (GenOpFunc3)(long, long, long);
-                    
-static GenOpFunc *gen_op_mov_reg_T0[3][8] = {
+#ifdef TARGET_X86_64
+
+#define NB_OP_SIZES 4 /* table rows for OT_BYTE, OT_WORD, OT_LONG and OT_QUAD */
+
+#define DEF_REGS(prefix, suffix) /* 16 names: EAX..EDI, then R8..R15 */ \
+  prefix ## EAX ## suffix,\
+  prefix ## ECX ## suffix,\
+  prefix ## EDX ## suffix,\
+  prefix ## EBX ## suffix,\
+  prefix ## ESP ## suffix,\
+  prefix ## EBP ## suffix,\
+  prefix ## ESI ## suffix,\
+  prefix ## EDI ## suffix,\
+  prefix ## R8 ## suffix,\
+  prefix ## R9 ## suffix,\
+  prefix ## R10 ## suffix,\
+  prefix ## R11 ## suffix,\
+  prefix ## R12 ## suffix,\
+  prefix ## R13 ## suffix,\
+  prefix ## R14 ## suffix,\
+  prefix ## R15 ## suffix,
+
+#define DEF_BREGS(prefixb, prefixh, suffix)             \
+    /* four wrappers; each picks its op from x86_64_hregs */ \
+static void prefixb ## ESP ## suffix ## _wrapper(void)  \
+{                                                       \
+    if (x86_64_hregs)                                 \
+        prefixb ## ESP ## suffix ();                    \
+    else                                                \
+        prefixh ## EAX ## suffix ();                    \
+}                                                       \
+                                                        \
+static void prefixb ## EBP ## suffix ## _wrapper(void)  \
+{                                                       \
+    if (x86_64_hregs)                                 \
+        prefixb ## EBP ## suffix ();                    \
+    else                                                \
+        prefixh ## ECX ## suffix ();                    \
+}                                                       \
+                                                        \
+static void prefixb ## ESI ## suffix ## _wrapper(void)  \
+{                                                       \
+    if (x86_64_hregs)                                 \
+        prefixb ## ESI ## suffix ();                    \
+    else                                                \
+        prefixh ## EDX ## suffix ();                    \
+}                                                       \
+                                                        \
+static void prefixb ## EDI ## suffix ## _wrapper(void)  \
+{                                                       \
+    if (x86_64_hregs)                                 \
+        prefixb ## EDI ## suffix ();                    \
+    else                                                \
+        prefixh ## EBX ## suffix ();                    \
+}
+
+DEF_BREGS(gen_op_movb_, gen_op_movh_, _T0) /* gen_op_movb_{ESP,EBP,ESI,EDI}_T0_wrapper */
+DEF_BREGS(gen_op_movb_, gen_op_movh_, _T1) /* gen_op_movb_{ESP,EBP,ESI,EDI}_T1_wrapper */
+DEF_BREGS(gen_op_movl_T0_, gen_op_movh_T0_, ) /* gen_op_movl_T0_{ESP,EBP,ESI,EDI}_wrapper */
+DEF_BREGS(gen_op_movl_T1_, gen_op_movh_T1_, ) /* gen_op_movl_T1_{ESP,EBP,ESI,EDI}_wrapper */
+
+#else /* !TARGET_X86_64 */
+
+#define NB_OP_SIZES 3 /* table rows for OT_BYTE, OT_WORD and OT_LONG */
+
+#define DEF_REGS(prefix, suffix) /* 8 names: EAX..EDI */ \
+  prefix ## EAX ## suffix,\
+  prefix ## ECX ## suffix,\
+  prefix ## EDX ## suffix,\
+  prefix ## EBX ## suffix,\
+  prefix ## ESP ## suffix,\
+  prefix ## EBP ## suffix,\
+  prefix ## ESI ## suffix,\
+  prefix ## EDI ## suffix,
+
+#endif /* !TARGET_X86_64 */
+
+static GenOpFunc *gen_op_mov_reg_T0[NB_OP_SIZES][CPU_NB_REGS] = {
     [OT_BYTE] = {
         gen_op_movb_EAX_T0,
         gen_op_movb_ECX_T0,
         gen_op_movb_EDX_T0,
         gen_op_movb_EBX_T0,
+#ifdef TARGET_X86_64
+        gen_op_movb_ESP_T0_wrapper,
+        gen_op_movb_EBP_T0_wrapper,
+        gen_op_movb_ESI_T0_wrapper,
+        gen_op_movb_EDI_T0_wrapper,
+        gen_op_movb_R8_T0,
+        gen_op_movb_R9_T0,
+        gen_op_movb_R10_T0,
+        gen_op_movb_R11_T0,
+        gen_op_movb_R12_T0,
+        gen_op_movb_R13_T0,
+        gen_op_movb_R14_T0,
+        gen_op_movb_R15_T0,
+#else
         gen_op_movh_EAX_T0,
         gen_op_movh_ECX_T0,
         gen_op_movh_EDX_T0,
         gen_op_movh_EBX_T0,
+#endif
     },
     [OT_WORD] = {
-        gen_op_movw_EAX_T0,
-        gen_op_movw_ECX_T0,
-        gen_op_movw_EDX_T0,
-        gen_op_movw_EBX_T0,
-        gen_op_movw_ESP_T0,
-        gen_op_movw_EBP_T0,
-        gen_op_movw_ESI_T0,
-        gen_op_movw_EDI_T0,
+        DEF_REGS(gen_op_movw_, _T0)
     },
     [OT_LONG] = {
-        gen_op_movl_EAX_T0,
-        gen_op_movl_ECX_T0,
-        gen_op_movl_EDX_T0,
-        gen_op_movl_EBX_T0,
-        gen_op_movl_ESP_T0,
-        gen_op_movl_EBP_T0,
-        gen_op_movl_ESI_T0,
-        gen_op_movl_EDI_T0,
+        DEF_REGS(gen_op_movl_, _T0)
     },
+#ifdef TARGET_X86_64
+    [OT_QUAD] = {
+        DEF_REGS(gen_op_movq_, _T0)
+    },
+#endif
 };
 
-static GenOpFunc *gen_op_mov_reg_T1[3][8] = {
+static GenOpFunc *gen_op_mov_reg_T1[NB_OP_SIZES][CPU_NB_REGS] = {
     [OT_BYTE] = {
         gen_op_movb_EAX_T1,
         gen_op_movb_ECX_T1,
         gen_op_movb_EDX_T1,
         gen_op_movb_EBX_T1,
+#ifdef TARGET_X86_64
+        gen_op_movb_ESP_T1_wrapper,
+        gen_op_movb_EBP_T1_wrapper,
+        gen_op_movb_ESI_T1_wrapper,
+        gen_op_movb_EDI_T1_wrapper,
+        gen_op_movb_R8_T1,
+        gen_op_movb_R9_T1,
+        gen_op_movb_R10_T1,
+        gen_op_movb_R11_T1,
+        gen_op_movb_R12_T1,
+        gen_op_movb_R13_T1,
+        gen_op_movb_R14_T1,
+        gen_op_movb_R15_T1,
+#else
         gen_op_movh_EAX_T1,
         gen_op_movh_ECX_T1,
         gen_op_movh_EDX_T1,
         gen_op_movh_EBX_T1,
+#endif
     },
     [OT_WORD] = {
-        gen_op_movw_EAX_T1,
-        gen_op_movw_ECX_T1,
-        gen_op_movw_EDX_T1,
-        gen_op_movw_EBX_T1,
-        gen_op_movw_ESP_T1,
-        gen_op_movw_EBP_T1,
-        gen_op_movw_ESI_T1,
-        gen_op_movw_EDI_T1,
+        DEF_REGS(gen_op_movw_, _T1)
     },
     [OT_LONG] = {
-        gen_op_movl_EAX_T1,
-        gen_op_movl_ECX_T1,
-        gen_op_movl_EDX_T1,
-        gen_op_movl_EBX_T1,
-        gen_op_movl_ESP_T1,
-        gen_op_movl_EBP_T1,
-        gen_op_movl_ESI_T1,
-        gen_op_movl_EDI_T1,
+        DEF_REGS(gen_op_movl_, _T1)
+    },
+#ifdef TARGET_X86_64
+    [OT_QUAD] = {
+        DEF_REGS(gen_op_movq_, _T1)
     },
+#endif
 };
 
-static GenOpFunc *gen_op_mov_reg_A0[2][8] = {
+static GenOpFunc *gen_op_mov_reg_A0[NB_OP_SIZES - 1][CPU_NB_REGS] = {
     [0] = {
-        gen_op_movw_EAX_A0,
-        gen_op_movw_ECX_A0,
-        gen_op_movw_EDX_A0,
-        gen_op_movw_EBX_A0,
-        gen_op_movw_ESP_A0,
-        gen_op_movw_EBP_A0,
-        gen_op_movw_ESI_A0,
-        gen_op_movw_EDI_A0,
+        DEF_REGS(gen_op_movw_, _A0)
     },
     [1] = {
-        gen_op_movl_EAX_A0,
-        gen_op_movl_ECX_A0,
-        gen_op_movl_EDX_A0,
-        gen_op_movl_EBX_A0,
-        gen_op_movl_ESP_A0,
-        gen_op_movl_EBP_A0,
-        gen_op_movl_ESI_A0,
-        gen_op_movl_EDI_A0,
+        DEF_REGS(gen_op_movl_, _A0)
+    },
+#ifdef TARGET_X86_64
+    [2] = {
+        DEF_REGS(gen_op_movq_, _A0)
     },
+#endif
 };
 
-static GenOpFunc *gen_op_mov_TN_reg[3][2][8] = 
+static GenOpFunc *gen_op_mov_TN_reg[NB_OP_SIZES][2][CPU_NB_REGS] = 
 {
     [OT_BYTE] = {
         {
@@ -230,143 +341,132 @@ static GenOpFunc *gen_op_mov_TN_reg[3][2][8] =
             gen_op_movl_T0_ECX,
             gen_op_movl_T0_EDX,
             gen_op_movl_T0_EBX,
+#ifdef TARGET_X86_64
+            gen_op_movl_T0_ESP_wrapper,
+            gen_op_movl_T0_EBP_wrapper,
+            gen_op_movl_T0_ESI_wrapper,
+            gen_op_movl_T0_EDI_wrapper,
+            gen_op_movl_T0_R8,
+            gen_op_movl_T0_R9,
+            gen_op_movl_T0_R10,
+            gen_op_movl_T0_R11,
+            gen_op_movl_T0_R12,
+            gen_op_movl_T0_R13,
+            gen_op_movl_T0_R14,
+            gen_op_movl_T0_R15,
+#else
             gen_op_movh_T0_EAX,
             gen_op_movh_T0_ECX,
             gen_op_movh_T0_EDX,
             gen_op_movh_T0_EBX,
+#endif
         },
         {
             gen_op_movl_T1_EAX,
             gen_op_movl_T1_ECX,
             gen_op_movl_T1_EDX,
             gen_op_movl_T1_EBX,
+#ifdef TARGET_X86_64
+            gen_op_movl_T1_ESP_wrapper,
+            gen_op_movl_T1_EBP_wrapper,
+            gen_op_movl_T1_ESI_wrapper,
+            gen_op_movl_T1_EDI_wrapper,
+            gen_op_movl_T1_R8,
+            gen_op_movl_T1_R9,
+            gen_op_movl_T1_R10,
+            gen_op_movl_T1_R11,
+            gen_op_movl_T1_R12,
+            gen_op_movl_T1_R13,
+            gen_op_movl_T1_R14,
+            gen_op_movl_T1_R15,
+#else
             gen_op_movh_T1_EAX,
             gen_op_movh_T1_ECX,
             gen_op_movh_T1_EDX,
             gen_op_movh_T1_EBX,
+#endif
         },
     },
     [OT_WORD] = {
         {
-            gen_op_movl_T0_EAX,
-            gen_op_movl_T0_ECX,
-            gen_op_movl_T0_EDX,
-            gen_op_movl_T0_EBX,
-            gen_op_movl_T0_ESP,
-            gen_op_movl_T0_EBP,
-            gen_op_movl_T0_ESI,
-            gen_op_movl_T0_EDI,
+            DEF_REGS(gen_op_movl_T0_, )
         },
         {
-            gen_op_movl_T1_EAX,
-            gen_op_movl_T1_ECX,
-            gen_op_movl_T1_EDX,
-            gen_op_movl_T1_EBX,
-            gen_op_movl_T1_ESP,
-            gen_op_movl_T1_EBP,
-            gen_op_movl_T1_ESI,
-            gen_op_movl_T1_EDI,
+            DEF_REGS(gen_op_movl_T1_, )
         },
     },
     [OT_LONG] = {
         {
-            gen_op_movl_T0_EAX,
-            gen_op_movl_T0_ECX,
-            gen_op_movl_T0_EDX,
-            gen_op_movl_T0_EBX,
-            gen_op_movl_T0_ESP,
-            gen_op_movl_T0_EBP,
-            gen_op_movl_T0_ESI,
-            gen_op_movl_T0_EDI,
+            DEF_REGS(gen_op_movl_T0_, )
         },
         {
-            gen_op_movl_T1_EAX,
-            gen_op_movl_T1_ECX,
-            gen_op_movl_T1_EDX,
-            gen_op_movl_T1_EBX,
-            gen_op_movl_T1_ESP,
-            gen_op_movl_T1_EBP,
-            gen_op_movl_T1_ESI,
-            gen_op_movl_T1_EDI,
+            DEF_REGS(gen_op_movl_T1_, )
+        },
+    },
+#ifdef TARGET_X86_64
+    [OT_QUAD] = {
+        {
+            DEF_REGS(gen_op_movl_T0_, )
+        },
+        {
+            DEF_REGS(gen_op_movl_T1_, )
         },
     },
+#endif
+};
+
+static GenOpFunc *gen_op_movl_A0_reg[CPU_NB_REGS] = {
+    DEF_REGS(gen_op_movl_A0_, )
+};
+
+static GenOpFunc *gen_op_addl_A0_reg_sN[4][CPU_NB_REGS] = {
+    [0] = {
+        DEF_REGS(gen_op_addl_A0_, )
+    },
+    [1] = {
+        DEF_REGS(gen_op_addl_A0_, _s1)
+    },
+    [2] = {
+        DEF_REGS(gen_op_addl_A0_, _s2)
+    },
+    [3] = {
+        DEF_REGS(gen_op_addl_A0_, _s3)
+    },
 };
 
-static GenOpFunc *gen_op_movl_A0_reg[8] = {
-    gen_op_movl_A0_EAX,
-    gen_op_movl_A0_ECX,
-    gen_op_movl_A0_EDX,
-    gen_op_movl_A0_EBX,
-    gen_op_movl_A0_ESP,
-    gen_op_movl_A0_EBP,
-    gen_op_movl_A0_ESI,
-    gen_op_movl_A0_EDI,
+#ifdef TARGET_X86_64
+static GenOpFunc *gen_op_movq_A0_reg[CPU_NB_REGS] = {
+    DEF_REGS(gen_op_movq_A0_, )
 };
 
-static GenOpFunc *gen_op_addl_A0_reg_sN[4][8] = {
+static GenOpFunc *gen_op_addq_A0_reg_sN[4][CPU_NB_REGS] = {
     [0] = {
-        gen_op_addl_A0_EAX,
-        gen_op_addl_A0_ECX,
-        gen_op_addl_A0_EDX,
-        gen_op_addl_A0_EBX,
-        gen_op_addl_A0_ESP,
-        gen_op_addl_A0_EBP,
-        gen_op_addl_A0_ESI,
-        gen_op_addl_A0_EDI,
+        DEF_REGS(gen_op_addq_A0_, )
     },
     [1] = {
-        gen_op_addl_A0_EAX_s1,
-        gen_op_addl_A0_ECX_s1,
-        gen_op_addl_A0_EDX_s1,
-        gen_op_addl_A0_EBX_s1,
-        gen_op_addl_A0_ESP_s1,
-        gen_op_addl_A0_EBP_s1,
-        gen_op_addl_A0_ESI_s1,
-        gen_op_addl_A0_EDI_s1,
+        DEF_REGS(gen_op_addq_A0_, _s1)
     },
     [2] = {
-        gen_op_addl_A0_EAX_s2,
-        gen_op_addl_A0_ECX_s2,
-        gen_op_addl_A0_EDX_s2,
-        gen_op_addl_A0_EBX_s2,
-        gen_op_addl_A0_ESP_s2,
-        gen_op_addl_A0_EBP_s2,
-        gen_op_addl_A0_ESI_s2,
-        gen_op_addl_A0_EDI_s2,
+        DEF_REGS(gen_op_addq_A0_, _s2)
     },
     [3] = {
-        gen_op_addl_A0_EAX_s3,
-        gen_op_addl_A0_ECX_s3,
-        gen_op_addl_A0_EDX_s3,
-        gen_op_addl_A0_EBX_s3,
-        gen_op_addl_A0_ESP_s3,
-        gen_op_addl_A0_EBP_s3,
-        gen_op_addl_A0_ESI_s3,
-        gen_op_addl_A0_EDI_s3,
+        DEF_REGS(gen_op_addq_A0_, _s3)
     },
 };
+#endif
 
-static GenOpFunc *gen_op_cmov_reg_T1_T0[2][8] = {
+static GenOpFunc *gen_op_cmov_reg_T1_T0[NB_OP_SIZES - 1][CPU_NB_REGS] = {
     [0] = {
-        gen_op_cmovw_EAX_T1_T0,
-        gen_op_cmovw_ECX_T1_T0,
-        gen_op_cmovw_EDX_T1_T0,
-        gen_op_cmovw_EBX_T1_T0,
-        gen_op_cmovw_ESP_T1_T0,
-        gen_op_cmovw_EBP_T1_T0,
-        gen_op_cmovw_ESI_T1_T0,
-        gen_op_cmovw_EDI_T1_T0,
+        DEF_REGS(gen_op_cmovw_, _T1_T0)
     },
     [1] = {
-        gen_op_cmovl_EAX_T1_T0,
-        gen_op_cmovl_ECX_T1_T0,
-        gen_op_cmovl_EDX_T1_T0,
-        gen_op_cmovl_EBX_T1_T0,
-        gen_op_cmovl_ESP_T1_T0,
-        gen_op_cmovl_EBP_T1_T0,
-        gen_op_cmovl_ESI_T1_T0,
-        gen_op_cmovl_EDI_T1_T0,
+        DEF_REGS(gen_op_cmovl_, _T1_T0)
+    },
+#ifdef TARGET_X86_64
+    [2] = {
+        DEF_REGS(gen_op_cmovq_, _T1_T0)
     },
+#endif
 };
 
 static GenOpFunc *gen_op_arith_T0_T1_cc[8] = {
@@ -380,34 +480,34 @@ static GenOpFunc *gen_op_arith_T0_T1_cc[8] = {
     NULL,
 };
 
-static GenOpFunc *gen_op_arithc_T0_T1_cc[3][2] = {
-    [OT_BYTE] = {
-        gen_op_adcb_T0_T1_cc,
-        gen_op_sbbb_T0_T1_cc,
-    },
-    [OT_WORD] = {
-        gen_op_adcw_T0_T1_cc,
-        gen_op_sbbw_T0_T1_cc,
-    },
-    [OT_LONG] = {
-        gen_op_adcl_T0_T1_cc,
-        gen_op_sbbl_T0_T1_cc,
+#define DEF_ARITHC(SUFFIX)\
+    { /* byte row: adc, sbb */\
+        gen_op_adcb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_sbbb ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* word row */\
+        gen_op_adcw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_sbbw ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* long row */\
+        gen_op_adcl ## SUFFIX ## _T0_T1_cc,\
+        gen_op_sbbl ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* quad row: both slots are NULL unless TARGET_X86_64 */\
+        X86_64_ONLY(gen_op_adcq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_sbbq ## SUFFIX ## _T0_T1_cc),\
     },
+
+static GenOpFunc *gen_op_arithc_T0_T1_cc[4][2] = { /* [size row b/w/l/q][0 = adc, 1 = sbb] */
+    DEF_ARITHC( )
 };
 
-static GenOpFunc *gen_op_arithc_mem_T0_T1_cc[3][2] = {
-    [OT_BYTE] = {
-        gen_op_adcb_mem_T0_T1_cc,
-        gen_op_sbbb_mem_T0_T1_cc,
-    },
-    [OT_WORD] = {
-        gen_op_adcw_mem_T0_T1_cc,
-        gen_op_sbbw_mem_T0_T1_cc,
-    },
-    [OT_LONG] = {
-        gen_op_adcl_mem_T0_T1_cc,
-        gen_op_sbbl_mem_T0_T1_cc,
-    },
+static GenOpFunc *gen_op_arithc_mem_T0_T1_cc[3 * 4][2] = { /* four size rows per access variant */
+    DEF_ARITHC(_raw) /* rows 0-3 */
+#ifndef CONFIG_USER_ONLY
+    DEF_ARITHC(_kernel) /* rows 4-7 */
+    DEF_ARITHC(_user) /* rows 8-11 */
+#endif /* !CONFIG_USER_ONLY */
 };
 
 static const int cc_op_arithb[8] = {
@@ -421,129 +521,121 @@ static const int cc_op_arithb[8] = {
     CC_OP_SUBB,
 };
 
-static GenOpFunc *gen_op_cmpxchg_T0_T1_EAX_cc[3] = {
-    gen_op_cmpxchgb_T0_T1_EAX_cc,
-    gen_op_cmpxchgw_T0_T1_EAX_cc,
-    gen_op_cmpxchgl_T0_T1_EAX_cc,
+#define DEF_CMPXCHG(SUFFIX)\
+    gen_op_cmpxchgb ## SUFFIX ## _T0_T1_EAX_cc,\
+    gen_op_cmpxchgw ## SUFFIX ## _T0_T1_EAX_cc,\
+    gen_op_cmpxchgl ## SUFFIX ## _T0_T1_EAX_cc,\
+    X86_64_ONLY(gen_op_cmpxchgq ## SUFFIX ## _T0_T1_EAX_cc), /* NULL unless TARGET_X86_64 */
+
+static GenOpFunc *gen_op_cmpxchg_T0_T1_EAX_cc[4] = { /* entries in b, w, l, q order */
+    DEF_CMPXCHG( )
 };
 
-static GenOpFunc *gen_op_cmpxchg_mem_T0_T1_EAX_cc[3] = {
-    gen_op_cmpxchgb_mem_T0_T1_EAX_cc,
-    gen_op_cmpxchgw_mem_T0_T1_EAX_cc,
-    gen_op_cmpxchgl_mem_T0_T1_EAX_cc,
+static GenOpFunc *gen_op_cmpxchg_mem_T0_T1_EAX_cc[3 * 4] = { /* four size entries per access variant */
+    DEF_CMPXCHG(_raw) /* entries 0-3 */
+#ifndef CONFIG_USER_ONLY
+    DEF_CMPXCHG(_kernel) /* entries 4-7 */
+    DEF_CMPXCHG(_user) /* entries 8-11 */
+#endif /* !CONFIG_USER_ONLY */
 };
 
-static GenOpFunc *gen_op_shift_T0_T1_cc[3][8] = {
-    [OT_BYTE] = {
-        gen_op_rolb_T0_T1_cc,
-        gen_op_rorb_T0_T1_cc,
-        gen_op_rclb_T0_T1_cc,
-        gen_op_rcrb_T0_T1_cc,
-        gen_op_shlb_T0_T1_cc,
-        gen_op_shrb_T0_T1_cc,
-        gen_op_shlb_T0_T1_cc,
-        gen_op_sarb_T0_T1_cc,
-    },
-    [OT_WORD] = {
-        gen_op_rolw_T0_T1_cc,
-        gen_op_rorw_T0_T1_cc,
-        gen_op_rclw_T0_T1_cc,
-        gen_op_rcrw_T0_T1_cc,
-        gen_op_shlw_T0_T1_cc,
-        gen_op_shrw_T0_T1_cc,
-        gen_op_shlw_T0_T1_cc,
-        gen_op_sarw_T0_T1_cc,
-    },
-    [OT_LONG] = {
-        gen_op_roll_T0_T1_cc,
-        gen_op_rorl_T0_T1_cc,
-        gen_op_rcll_T0_T1_cc,
-        gen_op_rcrl_T0_T1_cc,
-        gen_op_shll_T0_T1_cc,
-        gen_op_shrl_T0_T1_cc,
-        gen_op_shll_T0_T1_cc,
-        gen_op_sarl_T0_T1_cc,
+#define DEF_SHIFT(SUFFIX)\
+    { /* byte row: rol, ror, rcl, rcr, shl, shr, shl, sar */\
+        gen_op_rolb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rorb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rclb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rcrb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shlb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shrb ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shlb ## SUFFIX ## _T0_T1_cc, /* slot 6 repeats the shl op of slot 4 */\
+        gen_op_sarb ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* word row, same slot order */\
+        gen_op_rolw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rorw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rclw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rcrw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shlw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shrw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shlw ## SUFFIX ## _T0_T1_cc,\
+        gen_op_sarw ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* long row, same slot order */\
+        gen_op_roll ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rorl ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rcll ## SUFFIX ## _T0_T1_cc,\
+        gen_op_rcrl ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shll ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shrl ## SUFFIX ## _T0_T1_cc,\
+        gen_op_shll ## SUFFIX ## _T0_T1_cc,\
+        gen_op_sarl ## SUFFIX ## _T0_T1_cc,\
+    },\
+    { /* quad row: every slot is NULL unless TARGET_X86_64 */\
+        X86_64_ONLY(gen_op_rolq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_rorq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_rclq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_rcrq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_shlq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_shrq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_shlq ## SUFFIX ## _T0_T1_cc),\
+        X86_64_ONLY(gen_op_sarq ## SUFFIX ## _T0_T1_cc),\
     },
+
+static GenOpFunc *gen_op_shift_T0_T1_cc[4][8] = { /* [size row b/w/l/q][shift op slot] */
+    DEF_SHIFT( )
 };
 
-static GenOpFunc *gen_op_shift_mem_T0_T1_cc[3][8] = {
-    [OT_BYTE] = {
-        gen_op_rolb_mem_T0_T1_cc,
-        gen_op_rorb_mem_T0_T1_cc,
-        gen_op_rclb_mem_T0_T1_cc,
-        gen_op_rcrb_mem_T0_T1_cc,
-        gen_op_shlb_mem_T0_T1_cc,
-        gen_op_shrb_mem_T0_T1_cc,
-        gen_op_shlb_mem_T0_T1_cc,
-        gen_op_sarb_mem_T0_T1_cc,
-    },
-    [OT_WORD] = {
-        gen_op_rolw_mem_T0_T1_cc,
-        gen_op_rorw_mem_T0_T1_cc,
-        gen_op_rclw_mem_T0_T1_cc,
-        gen_op_rcrw_mem_T0_T1_cc,
-        gen_op_shlw_mem_T0_T1_cc,
-        gen_op_shrw_mem_T0_T1_cc,
-        gen_op_shlw_mem_T0_T1_cc,
-        gen_op_sarw_mem_T0_T1_cc,
-    },
-    [OT_LONG] = {
-        gen_op_roll_mem_T0_T1_cc,
-        gen_op_rorl_mem_T0_T1_cc,
-        gen_op_rcll_mem_T0_T1_cc,
-        gen_op_rcrl_mem_T0_T1_cc,
-        gen_op_shll_mem_T0_T1_cc,
-        gen_op_shrl_mem_T0_T1_cc,
-        gen_op_shll_mem_T0_T1_cc,
-        gen_op_sarl_mem_T0_T1_cc,
-    },
+static GenOpFunc *gen_op_shift_mem_T0_T1_cc[3 * 4][8] = { /* four size rows per access variant */
+    DEF_SHIFT(_raw) /* rows 0-3 */
+#ifndef CONFIG_USER_ONLY
+    DEF_SHIFT(_kernel) /* rows 4-7 */
+    DEF_SHIFT(_user) /* rows 8-11 */
+#endif /* !CONFIG_USER_ONLY */
 };
 
-static GenOpFunc1 *gen_op_shiftd_T0_T1_im_cc[2][2] = {
-    [0] = {
-        gen_op_shldw_T0_T1_im_cc,
-        gen_op_shrdw_T0_T1_im_cc,
-    },
-    [1] = {
-        gen_op_shldl_T0_T1_im_cc,
-        gen_op_shrdl_T0_T1_im_cc,
+#define DEF_SHIFTD(SUFFIX, op)\
+    { /* first row: both slots are NULL */\
+        NULL,\
+        NULL,\
+    },\
+    { /* word row: shld, shrd */\
+        gen_op_shldw ## SUFFIX ## _T0_T1_ ## op ## _cc,\
+        gen_op_shrdw ## SUFFIX ## _T0_T1_ ## op ## _cc,\
+     },\
+    { /* long row */\
+        gen_op_shldl ## SUFFIX ## _T0_T1_ ## op ## _cc,\
+        gen_op_shrdl ## SUFFIX ## _T0_T1_ ## op ## _cc,\
+    },\
+    { /* quad row: X86_64_DEF leaves these braces empty unless TARGET_X86_64 */\
+X86_64_DEF(gen_op_shldq ## SUFFIX ## _T0_T1_ ## op ## _cc,\
+           gen_op_shrdq ## SUFFIX ## _T0_T1_ ## op ## _cc,)\
     },
+
+static GenOpFunc1 *gen_op_shiftd_T0_T1_im_cc[4][2] = { /* [size row][0 = shld, 1 = shrd] */
+    DEF_SHIFTD(, im)
 };
 
-static GenOpFunc *gen_op_shiftd_T0_T1_ECX_cc[2][2] = {
-    [0] = {
-        gen_op_shldw_T0_T1_ECX_cc,
-        gen_op_shrdw_T0_T1_ECX_cc,
-    },
-    [1] = {
-        gen_op_shldl_T0_T1_ECX_cc,
-        gen_op_shrdl_T0_T1_ECX_cc,
-    },
+static GenOpFunc *gen_op_shiftd_T0_T1_ECX_cc[4][2] = { /* [size row][0 = shld, 1 = shrd] */
+    DEF_SHIFTD(, ECX) /* empty SUFFIX, ops named ..._T0_T1_ECX_cc */
 };
 
-static GenOpFunc1 *gen_op_shiftd_mem_T0_T1_im_cc[2][2] = {
-    [0] = {
-        gen_op_shldw_mem_T0_T1_im_cc,
-        gen_op_shrdw_mem_T0_T1_im_cc,
-    },
-    [1] = {
-        gen_op_shldl_mem_T0_T1_im_cc,
-        gen_op_shrdl_mem_T0_T1_im_cc,
-    },
+static GenOpFunc1 *gen_op_shiftd_mem_T0_T1_im_cc[3 * 4][2] = { /* four size rows per access variant */
+    DEF_SHIFTD(_raw, im) /* rows 0-3 */
+#ifndef CONFIG_USER_ONLY
+    DEF_SHIFTD(_kernel, im) /* rows 4-7 */
+    DEF_SHIFTD(_user, im) /* rows 8-11 */
+#endif /* !CONFIG_USER_ONLY */
 };
 
-static GenOpFunc *gen_op_shiftd_mem_T0_T1_ECX_cc[2][2] = {
-    [0] = {
-        gen_op_shldw_mem_T0_T1_ECX_cc,
-        gen_op_shrdw_mem_T0_T1_ECX_cc,
-    },
-    [1] = {
-        gen_op_shldl_mem_T0_T1_ECX_cc,
-        gen_op_shrdl_mem_T0_T1_ECX_cc,
-    },
+static GenOpFunc *gen_op_shiftd_mem_T0_T1_ECX_cc[3 * 4][2] = { /* four size rows per access variant */
+    DEF_SHIFTD(_raw, ECX) /* rows 0-3 */
+#ifndef CONFIG_USER_ONLY
+    DEF_SHIFTD(_kernel, ECX) /* rows 4-7 */
+    DEF_SHIFTD(_user, ECX) /* rows 8-11 */
+#endif /* !CONFIG_USER_ONLY */
 };
 
-static GenOpFunc *gen_op_btx_T0_T1_cc[2][4] = {
+static GenOpFunc *gen_op_btx_T0_T1_cc[3][4] = {
     [0] = {
         gen_op_btw_T0_T1_cc,
         gen_op_btsw_T0_T1_cc,
@@ -556,9 +648,23 @@ static GenOpFunc *gen_op_btx_T0_T1_cc[2][4] = {
         gen_op_btrl_T0_T1_cc,
         gen_op_btcl_T0_T1_cc,
     },
+#ifdef TARGET_X86_64
+    [2] = {
+        gen_op_btq_T0_T1_cc,
+        gen_op_btsq_T0_T1_cc,
+        gen_op_btrq_T0_T1_cc,
+        gen_op_btcq_T0_T1_cc,
+    },
+#endif
 };
 
-static GenOpFunc *gen_op_bsx_T0_cc[2][2] = {
+static GenOpFunc *gen_op_add_bit_A0_T1[3] = { /* entries in w, l, q order */
+    gen_op_add_bitw_A0_T1,
+    gen_op_add_bitl_A0_T1,
+    X86_64_ONLY(gen_op_add_bitq_A0_T1), /* NULL unless TARGET_X86_64 */
+};
+
+static GenOpFunc *gen_op_bsx_T0_cc[3][2] = {
     [0] = {
         gen_op_bsfw_T0_cc,
         gen_op_bsrw_T0_cc,
@@ -567,93 +673,158 @@ static GenOpFunc *gen_op_bsx_T0_cc[2][2] = {
         gen_op_bsfl_T0_cc,
         gen_op_bsrl_T0_cc,
     },
+#ifdef TARGET_X86_64
+    [2] = {
+        gen_op_bsfq_T0_cc,
+        gen_op_bsrq_T0_cc,
+    },
+#endif
 };
 
-static GenOpFunc *gen_op_lds_T0_A0[3 * 3] = {
+static GenOpFunc *gen_op_lds_T0_A0[3 * 4] = {
     gen_op_ldsb_raw_T0_A0,
     gen_op_ldsw_raw_T0_A0,
+    X86_64_ONLY(gen_op_ldsl_raw_T0_A0),
     NULL,
 #ifndef CONFIG_USER_ONLY
     gen_op_ldsb_kernel_T0_A0,
     gen_op_ldsw_kernel_T0_A0,
+    X86_64_ONLY(gen_op_ldsl_kernel_T0_A0),
     NULL,
 
     gen_op_ldsb_user_T0_A0,
     gen_op_ldsw_user_T0_A0,
+    X86_64_ONLY(gen_op_ldsl_user_T0_A0),
     NULL,
 #endif
 };
 
-static GenOpFunc *gen_op_ldu_T0_A0[3 * 3] = {
+static GenOpFunc *gen_op_ldu_T0_A0[3 * 4] = {
     gen_op_ldub_raw_T0_A0,
     gen_op_lduw_raw_T0_A0,
     NULL,
+    NULL,
 
 #ifndef CONFIG_USER_ONLY
     gen_op_ldub_kernel_T0_A0,
     gen_op_lduw_kernel_T0_A0,
     NULL,
+    NULL,
 
     gen_op_ldub_user_T0_A0,
     gen_op_lduw_user_T0_A0,
     NULL,
+    NULL,
 #endif
 };
 
 /* sign does not matter, except for lidt/lgdt call (TODO: fix it) */
-static GenOpFunc *gen_op_ld_T0_A0[3 * 3] = {
+static GenOpFunc *gen_op_ld_T0_A0[3 * 4] = {
     gen_op_ldub_raw_T0_A0,
     gen_op_lduw_raw_T0_A0,
     gen_op_ldl_raw_T0_A0,
+    X86_64_ONLY(gen_op_ldq_raw_T0_A0),
 
 #ifndef CONFIG_USER_ONLY
     gen_op_ldub_kernel_T0_A0,
     gen_op_lduw_kernel_T0_A0,
     gen_op_ldl_kernel_T0_A0,
+    X86_64_ONLY(gen_op_ldq_kernel_T0_A0),
 
     gen_op_ldub_user_T0_A0,
     gen_op_lduw_user_T0_A0,
     gen_op_ldl_user_T0_A0,
+    X86_64_ONLY(gen_op_ldq_user_T0_A0),
 #endif
 };
 
-static GenOpFunc *gen_op_ld_T1_A0[3 * 3] = {
+static GenOpFunc *gen_op_ld_T1_A0[3 * 4] = {
     gen_op_ldub_raw_T1_A0,
     gen_op_lduw_raw_T1_A0,
     gen_op_ldl_raw_T1_A0,
+    X86_64_ONLY(gen_op_ldq_raw_T1_A0),
 
 #ifndef CONFIG_USER_ONLY
     gen_op_ldub_kernel_T1_A0,
     gen_op_lduw_kernel_T1_A0,
     gen_op_ldl_kernel_T1_A0,
+    X86_64_ONLY(gen_op_ldq_kernel_T1_A0),
 
     gen_op_ldub_user_T1_A0,
     gen_op_lduw_user_T1_A0,
     gen_op_ldl_user_T1_A0,
+    X86_64_ONLY(gen_op_ldq_user_T1_A0),
 #endif
 };
 
-static GenOpFunc *gen_op_st_T0_A0[3 * 3] = {
+static GenOpFunc *gen_op_st_T0_A0[3 * 4] = {
     gen_op_stb_raw_T0_A0,
     gen_op_stw_raw_T0_A0,
     gen_op_stl_raw_T0_A0,
+    X86_64_ONLY(gen_op_stq_raw_T0_A0),
 
 #ifndef CONFIG_USER_ONLY
     gen_op_stb_kernel_T0_A0,
     gen_op_stw_kernel_T0_A0,
     gen_op_stl_kernel_T0_A0,
+    X86_64_ONLY(gen_op_stq_kernel_T0_A0),
 
     gen_op_stb_user_T0_A0,
     gen_op_stw_user_T0_A0,
     gen_op_stl_user_T0_A0,
+    X86_64_ONLY(gen_op_stq_user_T0_A0),
 #endif
 };
 
+static GenOpFunc *gen_op_st_T1_A0[3 * 4] = { /* four size entries per access variant: raw, kernel, user */
+    NULL, /* byte slot is left empty in every group */
+    gen_op_stw_raw_T1_A0,
+    gen_op_stl_raw_T1_A0,
+    X86_64_ONLY(gen_op_stq_raw_T1_A0), /* quad slots are NULL unless TARGET_X86_64 */
+
+#ifndef CONFIG_USER_ONLY
+    NULL,
+    gen_op_stw_kernel_T1_A0,
+    gen_op_stl_kernel_T1_A0,
+    X86_64_ONLY(gen_op_stq_kernel_T1_A0),
+
+    NULL,
+    gen_op_stw_user_T1_A0,
+    gen_op_stl_user_T1_A0,
+    X86_64_ONLY(gen_op_stq_user_T1_A0),
+#endif /* !CONFIG_USER_ONLY */
+};
+
+static inline void gen_jmp_im(target_ulong pc) /* emits one mov*_eip_im op for pc, chosen by the value's width */
+{
+#ifdef TARGET_X86_64
+    if (pc == (uint32_t)pc) { /* pc is unchanged by a round trip through uint32_t */
+        gen_op_movl_eip_im(pc);
+    } else if (pc == (int32_t)pc) { /* pc is unchanged by a round trip through int32_t */
+        gen_op_movq_eip_im(pc);
+    } else { /* neither fits: pass pc >> 32 and pc as two separate arguments */
+        gen_op_movq_eip_im64(pc >> 32, pc);
+    }
+#else /* !TARGET_X86_64 */
+    gen_op_movl_eip_im(pc);
+#endif
+}
+
 static inline void gen_string_movl_A0_ESI(DisasContext *s)
 {
     int override;
 
     override = s->override;
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        if (override >= 0) {
+            gen_op_movq_A0_seg(offsetof(CPUX86State,segs[override].base));
+            gen_op_addq_A0_reg_sN[0][R_ESI]();
+        } else {
+            gen_op_movq_A0_reg[R_ESI]();
+        }
+    } else
+#endif
     if (s->aflag) {
         /* 32 bit address */
         if (s->addseg && override < 0)
@@ -676,6 +847,11 @@ static inline void gen_string_movl_A0_ESI(DisasContext *s)
 
 static inline void gen_string_movl_A0_EDI(DisasContext *s)
 {
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_movq_A0_reg[R_EDI]();
+    } else
+#endif
     if (s->aflag) {
         if (s->addseg) {
             gen_op_movl_A0_seg(offsetof(CPUX86State,segs[R_ES].base));
@@ -690,50 +866,43 @@ static inline void gen_string_movl_A0_EDI(DisasContext *s)
     }
 }
 
-static GenOpFunc *gen_op_movl_T0_Dshift[3] = {
+static GenOpFunc *gen_op_movl_T0_Dshift[4] = { /* indexed by ot (see gen_movs); entries b, w, l, q */
     gen_op_movl_T0_Dshiftb,
     gen_op_movl_T0_Dshiftw,
     gen_op_movl_T0_Dshiftl,
+    X86_64_ONLY(gen_op_movl_T0_Dshiftq), /* NULL unless TARGET_X86_64 */
 };
 
-static GenOpFunc2 *gen_op_jz_ecx[2] = {
-    gen_op_jz_ecxw,
-    gen_op_jz_ecxl,
+static GenOpFunc1 *gen_op_jnz_ecx[3] = { /* indexed by s->aflag in gen_jz_ecx_string; entries w, l, q */
+    gen_op_jnz_ecxw,
+    gen_op_jnz_ecxl,
+    X86_64_ONLY(gen_op_jnz_ecxq), /* NULL unless TARGET_X86_64 */
 };
     
-static GenOpFunc1 *gen_op_jz_ecx_im[2] = {
-    gen_op_jz_ecxw_im,
-    gen_op_jz_ecxl_im,
+static GenOpFunc1 *gen_op_jz_ecx[3] = { /* entries in w, l, q order; now takes one argument (GenOpFunc1) */
+    gen_op_jz_ecxw,
+    gen_op_jz_ecxl,
+    X86_64_ONLY(gen_op_jz_ecxq), /* NULL unless TARGET_X86_64 */
 };
 
-static GenOpFunc *gen_op_dec_ECX[2] = {
+static GenOpFunc *gen_op_dec_ECX[3] = { /* entries in w, l, q order */
     gen_op_decw_ECX,
     gen_op_decl_ECX,
+    X86_64_ONLY(gen_op_decq_ECX), /* NULL unless TARGET_X86_64 */
 };
 
-static GenOpFunc1 *gen_op_string_jnz_sub[2][3] = {
+static GenOpFunc1 *gen_op_string_jnz_sub[2][4] = { /* row 0: jnz_sub ops, row 1: jz_sub ops; columns b, w, l, q */
     {
-        gen_op_string_jnz_subb,
-        gen_op_string_jnz_subw,
-        gen_op_string_jnz_subl,
+        gen_op_jnz_subb,
+        gen_op_jnz_subw,
+        gen_op_jnz_subl,
+        X86_64_ONLY(gen_op_jnz_subq), /* NULL unless TARGET_X86_64 */
     },
     {
-        gen_op_string_jz_subb,
-        gen_op_string_jz_subw,
-        gen_op_string_jz_subl,
-    },
-};
-
-static GenOpFunc1 *gen_op_string_jnz_sub_im[2][3] = {
-    {
-        gen_op_string_jnz_subb_im,
-        gen_op_string_jnz_subw_im,
-        gen_op_string_jnz_subl_im,
-    },
-    {
-        gen_op_string_jz_subb_im,
-        gen_op_string_jz_subw_im,
-        gen_op_string_jz_subl_im,
+        gen_op_jz_subb,
+        gen_op_jz_subw,
+        gen_op_jz_subl,
+        X86_64_ONLY(gen_op_jz_subq), /* NULL unless TARGET_X86_64 */
     },
 };
 
@@ -773,12 +942,12 @@ static GenOpFunc *gen_check_io_DX[3] = {
     gen_op_check_iol_DX,
 };
 
-static void gen_check_io(DisasContext *s, int ot, int use_dx, int cur_eip)
+static void gen_check_io(DisasContext *s, int ot, int use_dx, target_ulong cur_eip)
 {
     if (s->pe && (s->cpl > s->iopl || s->vm86)) {
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
-        gen_op_jmp_im(cur_eip);
+        gen_jmp_im(cur_eip);
         if (use_dx)
             gen_check_io_DX[ot]();
         else
@@ -793,6 +962,12 @@ static inline void gen_movs(DisasContext *s, int ot)
     gen_string_movl_A0_EDI(s);
     gen_op_st_T0_A0[ot + s->mem_index]();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_ESI_T0();
+        gen_op_addq_EDI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_ESI_T0();
         gen_op_addl_EDI_T0();
@@ -810,15 +985,19 @@ static inline void gen_update_cc_op(DisasContext *s)
     }
 }
 
-static inline void gen_jz_ecx_string(DisasContext *s, unsigned int next_eip)
+/* XXX: does not work with gdbstub "ice" single step - not a
+   serious problem */
+static int gen_jz_ecx_string(DisasContext *s, target_ulong next_eip)
 {
-    if (s->jmp_opt) {
-        gen_op_jz_ecx[s->aflag]((long)s->tb, next_eip);
-    } else {
-        /* XXX: does not work with gdbstub "ice" single step - not a
-           serious problem */
-        gen_op_jz_ecx_im[s->aflag](next_eip);
-    }
+    int l1, l2;
+
+    l1 = gen_new_label();
+    l2 = gen_new_label();
+    gen_op_jnz_ecx[s->aflag](l1);
+    gen_set_label(l2);
+    gen_jmp_tb(s, next_eip, 1);
+    gen_set_label(l1);
+    return l2;
 }
 
 static inline void gen_stos(DisasContext *s, int ot)
@@ -827,6 +1006,11 @@ static inline void gen_stos(DisasContext *s, int ot)
     gen_string_movl_A0_EDI(s);
     gen_op_st_T0_A0[ot + s->mem_index]();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_EDI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_EDI_T0();
     } else {
@@ -840,6 +1024,11 @@ static inline void gen_lods(DisasContext *s, int ot)
     gen_op_ld_T0_A0[ot + s->mem_index]();
     gen_op_mov_reg_T0[ot][R_EAX]();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_ESI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_ESI_T0();
     } else {
@@ -854,6 +1043,11 @@ static inline void gen_scas(DisasContext *s, int ot)
     gen_op_ld_T1_A0[ot + s->mem_index]();
     gen_op_cmpl_T0_T1_cc();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_EDI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_EDI_T0();
     } else {
@@ -869,6 +1063,12 @@ static inline void gen_cmps(DisasContext *s, int ot)
     gen_op_ld_T1_A0[ot + s->mem_index]();
     gen_op_cmpl_T0_T1_cc();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_ESI_T0();
+        gen_op_addq_EDI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_ESI_T0();
         gen_op_addl_EDI_T0();
@@ -880,10 +1080,17 @@ static inline void gen_cmps(DisasContext *s, int ot)
 
 static inline void gen_ins(DisasContext *s, int ot)
 {
-    gen_op_in_DX_T0[ot]();
     gen_string_movl_A0_EDI(s);
+    gen_op_movl_T0_0();
+    gen_op_st_T0_A0[ot + s->mem_index]();
+    gen_op_in_DX_T0[ot]();
     gen_op_st_T0_A0[ot + s->mem_index]();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_EDI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_EDI_T0();
     } else {
@@ -897,6 +1104,11 @@ static inline void gen_outs(DisasContext *s, int ot)
     gen_op_ld_T0_A0[ot + s->mem_index]();
     gen_op_out_DX_T0[ot]();
     gen_op_movl_T0_Dshift[ot]();
+#ifdef TARGET_X86_64
+    if (s->aflag == 2) {
+        gen_op_addq_ESI_T0();
+    } else 
+#endif
     if (s->aflag) {
         gen_op_addl_ESI_T0();
     } else {
@@ -908,36 +1120,35 @@ static inline void gen_outs(DisasContext *s, int ot)
    instruction */
 #define GEN_REPZ(op)                                                          \
 static inline void gen_repz_ ## op(DisasContext *s, int ot,                   \
-                                 unsigned int cur_eip, unsigned int next_eip) \
+                                 target_ulong cur_eip, target_ulong next_eip) \
 {                                                                             \
+    int l2;\
     gen_update_cc_op(s);                                                      \
-    gen_jz_ecx_string(s, next_eip);                                           \
+    l2 = gen_jz_ecx_string(s, next_eip);                                      \
     gen_ ## op(s, ot);                                                        \
     gen_op_dec_ECX[s->aflag]();                                               \
     /* a loop would cause two single step exceptions if ECX = 1               \
        before rep string_insn */                                              \
     if (!s->jmp_opt)                                                          \
-        gen_op_jz_ecx_im[s->aflag](next_eip);                                 \
+        gen_op_jz_ecx[s->aflag](l2);                                          \
     gen_jmp(s, cur_eip);                                                      \
 }
 
 #define GEN_REPZ2(op)                                                         \
 static inline void gen_repz_ ## op(DisasContext *s, int ot,                   \
-                                   unsigned int cur_eip,                      \
-                                   unsigned int next_eip,                     \
+                                   target_ulong cur_eip,                      \
+                                   target_ulong next_eip,                     \
                                    int nz)                                    \
 {                                                                             \
+    int l2;\
     gen_update_cc_op(s);                                                      \
-    gen_jz_ecx_string(s, next_eip);                                           \
+    l2 = gen_jz_ecx_string(s, next_eip);                                      \
     gen_ ## op(s, ot);                                                        \
     gen_op_dec_ECX[s->aflag]();                                               \
     gen_op_set_cc_op(CC_OP_SUBB + ot);                                        \
+    gen_op_string_jnz_sub[nz][ot](l2);\
     if (!s->jmp_opt)                                                          \
-        gen_op_string_jnz_sub_im[nz][ot](next_eip);                           \
-    else                                                                      \
-        gen_op_string_jnz_sub[nz][ot]((long)s->tb);                           \
-    if (!s->jmp_opt)                                                          \
-        gen_op_jz_ecx_im[s->aflag](next_eip);                                 \
+        gen_op_jz_ecx[s->aflag](l2);                                          \
     gen_jmp(s, cur_eip);                                                      \
 }
 
@@ -960,7 +1171,7 @@ enum {
     JCC_LE,
 };
 
-static GenOpFunc3 *gen_jcc_sub[3][8] = {
+static GenOpFunc1 *gen_jcc_sub[4][8] = {
     [OT_BYTE] = {
         NULL,
         gen_op_jb_subb,
@@ -991,20 +1202,37 @@ static GenOpFunc3 *gen_jcc_sub[3][8] = {
         gen_op_jl_subl,
         gen_op_jle_subl,
     },
+#ifdef TARGET_X86_64
+    [OT_QUAD] = {
+        NULL,
+        BUGGY_64(gen_op_jb_subq),
+        gen_op_jz_subq,
+        BUGGY_64(gen_op_jbe_subq),
+        gen_op_js_subq,
+        NULL,
+        BUGGY_64(gen_op_jl_subq),
+        BUGGY_64(gen_op_jle_subq),
+    },
+#endif
 };
-static GenOpFunc2 *gen_op_loop[2][4] = {
+static GenOpFunc1 *gen_op_loop[3][4] = {
     [0] = {
         gen_op_loopnzw,
         gen_op_loopzw,
-        gen_op_loopw,
-        gen_op_jecxzw,
+        gen_op_jnz_ecxw,
     },
     [1] = {
         gen_op_loopnzl,
         gen_op_loopzl,
-        gen_op_loopl,
-        gen_op_jecxzl,
+        gen_op_jnz_ecxl,
+    },
+#ifdef TARGET_X86_64
+    [2] = {
+        gen_op_loopnzq,
+        gen_op_loopzq,
+        gen_op_jnz_ecxq,
     },
+#endif
 };
 
 static GenOpFunc *gen_setcc_slow[8] = {
@@ -1018,7 +1246,7 @@ static GenOpFunc *gen_setcc_slow[8] = {
     gen_op_setle_T0_cc,
 };
 
-static GenOpFunc *gen_setcc_sub[3][8] = {
+static GenOpFunc *gen_setcc_sub[4][8] = {
     [OT_BYTE] = {
         NULL,
         gen_op_setb_T0_subb,
@@ -1049,6 +1277,18 @@ static GenOpFunc *gen_setcc_sub[3][8] = {
         gen_op_setl_T0_subl,
         gen_op_setle_T0_subl,
     },
+#ifdef TARGET_X86_64
+    [OT_QUAD] = {
+        NULL,
+        gen_op_setb_T0_subq,
+        gen_op_setz_T0_subq,
+        gen_op_setbe_T0_subq,
+        gen_op_sets_T0_subq,
+        NULL,
+        gen_op_setl_T0_subq,
+        gen_op_setle_T0_subq,
+    },
+#endif
 };
 
 static GenOpFunc *gen_op_fp_arith_ST0_FT0[8] = {
@@ -1093,7 +1333,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
             gen_op_arithc_T0_T1_cc[ot][op - OP_ADCL]();
             gen_op_mov_reg_T0[ot][d]();
         } else {
-            gen_op_arithc_mem_T0_T1_cc[ot][op - OP_ADCL]();
+            gen_op_arithc_mem_T0_T1_cc[ot + s1->mem_index][op - OP_ADCL]();
         }
         s1->cc_op = CC_OP_DYNAMIC;
         goto the_end;
@@ -1172,7 +1412,7 @@ static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
     if (d != OR_TMP0)
         gen_op_shift_T0_T1_cc[ot][op]();
     else
-        gen_op_shift_mem_T0_T1_cc[ot][op]();
+        gen_op_shift_mem_T0_T1_cc[ot + s1->mem_index][op]();
     if (d != OR_TMP0)
         gen_op_mov_reg_T0[ot][d]();
     s1->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
@@ -1187,8 +1427,9 @@ static void gen_shifti(DisasContext *s1, int op, int ot, int d, int c)
 
 static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_ptr)
 {
+    target_long disp;
     int havesib;
-    int base, disp;
+    int base;
     int index;
     int scale;
     int opreg;
@@ -1212,16 +1453,20 @@ static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_
             havesib = 1;
             code = ldub_code(s->pc++);
             scale = (code >> 6) & 3;
-            index = (code >> 3) & 7;
-            base = code & 7;
+            index = ((code >> 3) & 7) | REX_X(s);
+            base = (code & 7);
         }
+        base |= REX_B(s);
 
         switch (mod) {
         case 0:
-            if (base == 5) {
+            if ((base & 7) == 5) {
                 base = -1;
-                disp = ldl_code(s->pc);
+                disp = (int32_t)ldl_code(s->pc);
                 s->pc += 4;
+                if (CODE64(s) && !havesib) {
+                    disp += s->pc + s->rip_offset;
+                }
             } else {
                 disp = 0;
             }
@@ -1240,15 +1485,45 @@ static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_
             /* for correct popl handling with esp */
             if (base == 4 && s->popl_esp_hack)
                 disp += s->popl_esp_hack;
-            gen_op_movl_A0_reg[base]();
-            if (disp != 0)
-                gen_op_addl_A0_im(disp);
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                gen_op_movq_A0_reg[base]();
+                if (disp != 0) {
+                    if ((int32_t)disp == disp)
+                        gen_op_addq_A0_im(disp);
+                    else
+                        gen_op_addq_A0_im64(disp >> 32, disp);
+                }
+            } else 
+#endif
+            {
+                gen_op_movl_A0_reg[base]();
+                if (disp != 0)
+                    gen_op_addl_A0_im(disp);
+            }
         } else {
-            gen_op_movl_A0_im(disp);
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                if ((int32_t)disp == disp)
+                    gen_op_movq_A0_im(disp);
+                else
+                    gen_op_movq_A0_im64(disp >> 32, disp);
+            } else 
+#endif
+            {
+                gen_op_movl_A0_im(disp);
+            }
         }
         /* XXX: index == 4 is always invalid */
         if (havesib && (index != 4 || scale != 0)) {
-            gen_op_addl_A0_reg_sN[scale][index]();
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                gen_op_addq_A0_reg_sN[scale][index]();
+            } else 
+#endif
+            {
+                gen_op_addl_A0_reg_sN[scale][index]();
+            }
         }
         if (must_add_seg) {
             if (override < 0) {
@@ -1257,7 +1532,14 @@ static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_
                 else
                     override = R_DS;
             }
-            gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                gen_op_addq_A0_seg(offsetof(CPUX86State,segs[override].base));
+            } else 
+#endif
+            {
+                gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
+            }
         }
     } else {
         switch (mod) {
@@ -1333,34 +1615,108 @@ static void gen_lea_modrm(DisasContext *s, int modrm, int *reg_ptr, int *offset_
     *offset_ptr = disp;
 }
 
-/* generate modrm memory load or store of 'reg'. TMP0 is used if reg !=
-   OR_TMP0 */
-static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store)
+static void gen_nop_modrm(DisasContext *s, int modrm)
 {
-    int mod, rm, opreg, disp;
+    int mod, rm, base, code;
 
     mod = (modrm >> 6) & 3;
+    if (mod == 3)
+        return;
     rm = modrm & 7;
-    if (mod == 3) {
-        if (is_store) {
-            if (reg != OR_TMP0)
-                gen_op_mov_TN_reg[ot][0][reg]();
-            gen_op_mov_reg_T0[ot][rm]();
-        } else {
-            gen_op_mov_TN_reg[ot][0][rm]();
-            if (reg != OR_TMP0)
-                gen_op_mov_reg_T0[ot][reg]();
+
+    if (s->aflag) {
+
+        base = rm;
+        
+        if (base == 4) {
+            code = ldub_code(s->pc++);
+            base = (code & 7);
+        }
+        
+        switch (mod) {
+        case 0:
+            if (base == 5) {
+                s->pc += 4;
+            }
+            break;
+        case 1:
+            s->pc++;
+            break;
+        default:
+        case 2:
+            s->pc += 4;
+            break;
         }
     } else {
-        gen_lea_modrm(s, modrm, &opreg, &disp);
-        if (is_store) {
-            if (reg != OR_TMP0)
-                gen_op_mov_TN_reg[ot][0][reg]();
-            gen_op_st_T0_A0[ot + s->mem_index]();
-        } else {
-            gen_op_ld_T0_A0[ot + s->mem_index]();
-            if (reg != OR_TMP0)
-                gen_op_mov_reg_T0[ot][reg]();
+        switch (mod) {
+        case 0:
+            if (rm == 6) {
+                s->pc += 2;
+            }
+            break;
+        case 1:
+            s->pc++;
+            break;
+        default:
+        case 2:
+            s->pc += 2;
+            break;
+        }
+    }
+}
+
+/* used for LEA and MOV AX, mem: add a segment base to A0 (the override segment if one is active, else DS) */
+static void gen_add_A0_ds_seg(DisasContext *s)
+{
+    int override, must_add_seg;
+    must_add_seg = s->addseg; /* without an override, the base is only added when addseg is set */
+    override = R_DS; /* default segment; both branches below assign it again */
+    if (s->override >= 0) {
+        override = s->override;
+        must_add_seg = 1; /* an explicit override always forces the addition */
+    } else {
+        override = R_DS;
+    }
+    if (must_add_seg) {
+#ifdef TARGET_X86_64
+        if (CODE64(s)) { /* 64 bit code segment: use the addq form of the op */
+            gen_op_addq_A0_seg(offsetof(CPUX86State,segs[override].base));
+        } else 
+#endif
+        {
+            gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
+        }
+    }
+}
+
+/* generate modrm memory load or store of 'reg'. TMP0 is used if reg !=
+   OR_TMP0 */
+static void gen_ldst_modrm(DisasContext *s, int modrm, int ot, int reg, int is_store)
+{
+    int mod, rm, opreg, disp;
+
+    mod = (modrm >> 6) & 3;
+    rm = (modrm & 7) | REX_B(s);
+    if (mod == 3) {
+        if (is_store) {
+            if (reg != OR_TMP0)
+                gen_op_mov_TN_reg[ot][0][reg]();
+            gen_op_mov_reg_T0[ot][rm]();
+        } else {
+            gen_op_mov_TN_reg[ot][0][rm]();
+            if (reg != OR_TMP0)
+                gen_op_mov_reg_T0[ot][reg]();
+        }
+    } else {
+        gen_lea_modrm(s, modrm, &opreg, &disp);
+        if (is_store) {
+            if (reg != OR_TMP0)
+                gen_op_mov_TN_reg[ot][0][reg]();
+            gen_op_st_T0_A0[ot + s->mem_index]();
+        } else {
+            gen_op_ld_T0_A0[ot + s->mem_index]();
+            if (reg != OR_TMP0)
+                gen_op_mov_reg_T0[ot][reg]();
         }
     }
 }
@@ -1387,11 +1743,47 @@ static inline uint32_t insn_get(DisasContext *s, int ot)
     return ret;
 }
 
-static inline void gen_jcc(DisasContext *s, int b, int val, int next_eip)
+static inline int insn_const_size(unsigned int ot) /* presumably the byte size of an immediate for operand type 'ot' - confirm with callers */
+{
+    if (ot <= OT_LONG)
+        return 1 << ot; /* operand types up to OT_LONG map to 1 << ot */
+    else
+        return 4; /* any larger operand type is capped at 4 */
+}
+
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip) /* emit a jump to 'eip'; tb_num (0 or 1) selects goto_tb0 or goto_tb1 */
+{
+    TranslationBlock *tb;
+    target_ulong pc;
+
+    pc = s->cs_base + eip; /* target address including the CS base */
+    tb = s->tb;
+    /* NOTE: we handle the case where the TB spans two pages here: compare against both the TB start and the last translated byte */
+    if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
+        (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK))  {
+        /* jump to same page: we can use a direct jump */
+        if (tb_num == 0)
+            gen_op_goto_tb0(TBPARAM(tb));
+        else
+            gen_op_goto_tb1(TBPARAM(tb));
+        gen_jmp_im(eip);
+        gen_op_movl_T0_im((long)tb + tb_num); /* T0 = TB pointer plus tb_num; presumably decoded by the execution loop - confirm */
+        gen_op_exit_tb();
+    } else {
+        /* jump to another page: currently not optimized, set eip and end the block */
+        gen_jmp_im(eip);
+        gen_eob(s);
+    }
+}
+
+static inline void gen_jcc(DisasContext *s, int b, 
+                           target_ulong val, target_ulong next_eip)
 {
     TranslationBlock *tb;
     int inv, jcc_op;
-    GenOpFunc3 *func;
+    GenOpFunc1 *func;
+    target_ulong tmp;
+    int l1, l2;
 
     inv = b & 1;
     jcc_op = (b >> 1) & 7;
@@ -1402,6 +1794,7 @@ static inline void gen_jcc(DisasContext *s, int b, int val, int next_eip)
         case CC_OP_SUBB:
         case CC_OP_SUBW:
         case CC_OP_SUBL:
+        case CC_OP_SUBQ:
             func = gen_jcc_sub[s->cc_op - CC_OP_SUBB][jcc_op];
             break;
             
@@ -1409,33 +1802,48 @@ static inline void gen_jcc(DisasContext *s, int b, int val, int next_eip)
         case CC_OP_ADDB:
         case CC_OP_ADDW:
         case CC_OP_ADDL:
+        case CC_OP_ADDQ:
+
         case CC_OP_ADCB:
         case CC_OP_ADCW:
         case CC_OP_ADCL:
+        case CC_OP_ADCQ:
+
         case CC_OP_SBBB:
         case CC_OP_SBBW:
         case CC_OP_SBBL:
+        case CC_OP_SBBQ:
+
         case CC_OP_LOGICB:
         case CC_OP_LOGICW:
         case CC_OP_LOGICL:
+        case CC_OP_LOGICQ:
+
         case CC_OP_INCB:
         case CC_OP_INCW:
         case CC_OP_INCL:
+        case CC_OP_INCQ:
+
         case CC_OP_DECB:
         case CC_OP_DECW:
         case CC_OP_DECL:
+        case CC_OP_DECQ:
+
         case CC_OP_SHLB:
         case CC_OP_SHLW:
         case CC_OP_SHLL:
+        case CC_OP_SHLQ:
+
         case CC_OP_SARB:
         case CC_OP_SARW:
         case CC_OP_SARL:
+        case CC_OP_SARQ:
             switch(jcc_op) {
             case JCC_Z:
-                func = gen_jcc_sub[(s->cc_op - CC_OP_ADDB) % 3][jcc_op];
+                func = gen_jcc_sub[(s->cc_op - CC_OP_ADDB) % 4][jcc_op];
                 break;
             case JCC_S:
-                func = gen_jcc_sub[(s->cc_op - CC_OP_ADDB) % 3][jcc_op];
+                func = gen_jcc_sub[(s->cc_op - CC_OP_ADDB) % 4][jcc_op];
                 break;
             default:
                 func = NULL;
@@ -1447,32 +1855,52 @@ static inline void gen_jcc(DisasContext *s, int b, int val, int next_eip)
             break;
         }
 
-        if (s->cc_op != CC_OP_DYNAMIC)
+        if (s->cc_op != CC_OP_DYNAMIC) {
             gen_op_set_cc_op(s->cc_op);
+            s->cc_op = CC_OP_DYNAMIC;
+        }
 
         if (!func) {
             gen_setcc_slow[jcc_op]();
-            func = gen_op_jcc;
+            func = gen_op_jnz_T0_label;
         }
     
-        tb = s->tb;
-        if (!inv) {
-            func((long)tb, val, next_eip);
-        } else {
-            func((long)tb, next_eip, val);
+        if (inv) {
+            tmp = val;
+            val = next_eip;
+            next_eip = tmp;
         }
+        tb = s->tb;
+
+        l1 = gen_new_label();
+        func(l1);
+
+        gen_goto_tb(s, 0, next_eip);
+
+        gen_set_label(l1);
+        gen_goto_tb(s, 1, val);
+
         s->is_jmp = 3;
     } else {
+
         if (s->cc_op != CC_OP_DYNAMIC) {
             gen_op_set_cc_op(s->cc_op);
             s->cc_op = CC_OP_DYNAMIC;
         }
         gen_setcc_slow[jcc_op]();
-        if (!inv) {
-            gen_op_jcc_im(val, next_eip);
-        } else {
-            gen_op_jcc_im(next_eip, val);
+        if (inv) {
+            tmp = val;
+            val = next_eip;
+            next_eip = tmp;
         }
+        l1 = gen_new_label();
+        l2 = gen_new_label();
+        gen_op_jnz_T0_label(l1);
+        gen_jmp_im(next_eip);
+        gen_op_jmp_label(l2);
+        gen_set_label(l1);
+        gen_jmp_im(val);
+        gen_set_label(l2);
         gen_eob(s);
     }
 }
@@ -1489,6 +1917,7 @@ static void gen_setcc(DisasContext *s, int b)
     case CC_OP_SUBB:
     case CC_OP_SUBW:
     case CC_OP_SUBL:
+    case CC_OP_SUBQ:
         func = gen_setcc_sub[s->cc_op - CC_OP_SUBB][jcc_op];
         if (!func)
             goto slow_jcc;
@@ -1498,24 +1927,33 @@ static void gen_setcc(DisasContext *s, int b)
     case CC_OP_ADDB:
     case CC_OP_ADDW:
     case CC_OP_ADDL:
+    case CC_OP_ADDQ:
+
     case CC_OP_LOGICB:
     case CC_OP_LOGICW:
     case CC_OP_LOGICL:
+    case CC_OP_LOGICQ:
+
     case CC_OP_INCB:
     case CC_OP_INCW:
     case CC_OP_INCL:
+    case CC_OP_INCQ:
+
     case CC_OP_DECB:
     case CC_OP_DECW:
     case CC_OP_DECL:
+    case CC_OP_DECQ:
+
     case CC_OP_SHLB:
     case CC_OP_SHLW:
     case CC_OP_SHLL:
+    case CC_OP_SHLQ:
         switch(jcc_op) {
         case JCC_Z:
-            func = gen_setcc_sub[(s->cc_op - CC_OP_ADDB) % 3][jcc_op];
+            func = gen_setcc_sub[(s->cc_op - CC_OP_ADDB) % 4][jcc_op];
             break;
         case JCC_S:
-            func = gen_setcc_sub[(s->cc_op - CC_OP_ADDB) % 3][jcc_op];
+            func = gen_setcc_sub[(s->cc_op - CC_OP_ADDB) % 4][jcc_op];
             break;
         default:
             goto slow_jcc;
@@ -1536,68 +1974,37 @@ static void gen_setcc(DisasContext *s, int b)
 
 /* move T0 to seg_reg and compute if the CPU state may change. Never
    call this function with seg_reg == R_CS */
-static void gen_movl_seg_T0(DisasContext *s, int seg_reg, unsigned int cur_eip)
+static void gen_movl_seg_T0(DisasContext *s, int seg_reg, target_ulong cur_eip)
 {
-    if (s->pe && !s->vm86)
-        gen_op_movl_seg_T0(seg_reg, cur_eip);
-    else
-        gen_op_movl_seg_T0_vm(offsetof(CPUX86State,segs[seg_reg]));
-    /* abort translation because the register may have a non zero base
-       or because ss32 may change. For R_SS, translation must always
-       stop as a special handling must be done to disable hardware
-       interrupts for the next instruction */
-    if (seg_reg == R_SS || (!s->addseg && seg_reg < R_FS))
-        s->is_jmp = 3;
-}
-
-/* generate a push. It depends on ss32, addseg and dflag */
-static void gen_push_T0(DisasContext *s)
-{
-    if (s->ss32) {
-        if (!s->addseg) {
-            if (s->dflag)
-                gen_op_pushl_T0();
-            else
-                gen_op_pushw_T0();
-        } else {
-            if (s->dflag)
-                gen_op_pushl_ss32_T0();
-            else
-                gen_op_pushw_ss32_T0();
-        }
-    } else {
-        if (s->dflag)
-            gen_op_pushl_ss16_T0();
-        else
-            gen_op_pushw_ss16_T0();
-    }
-}
-
-/* two step pop is necessary for precise exceptions */
-static void gen_pop_T0(DisasContext *s)
-{
-    if (s->ss32) {
-        if (!s->addseg) {
-            if (s->dflag)
-                gen_op_popl_T0();
-            else
-                gen_op_popw_T0();
-        } else {
-            if (s->dflag)
-                gen_op_popl_ss32_T0();
-            else
-                gen_op_popw_ss32_T0();
-        }
+    if (s->pe && !s->vm86) {
+        /* XXX: optimize by finding processor state dynamically */
+        if (s->cc_op != CC_OP_DYNAMIC)
+            gen_op_set_cc_op(s->cc_op);
+        gen_jmp_im(cur_eip);
+        gen_op_movl_seg_T0(seg_reg);
+        /* abort translation because the addseg value may change or
+           because ss32 may change. For R_SS, translation must always
+           stop as a special handling must be done to disable hardware
+           interrupts for the next instruction */
+        if (seg_reg == R_SS || (s->code32 && seg_reg < R_FS))
+            s->is_jmp = 3;
     } else {
-        if (s->dflag)
-            gen_op_popl_ss16_T0();
-        else
-            gen_op_popw_ss16_T0();
+        gen_op_movl_seg_T0_vm(offsetof(CPUX86State,segs[seg_reg]));
+        if (seg_reg == R_SS)
+            s->is_jmp = 3;
     }
 }
 
 static inline void gen_stack_update(DisasContext *s, int addend)
 {
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        if (addend == 8)
+            gen_op_addq_ESP_8();
+        else 
+            gen_op_addq_ESP_im(addend);
+    } else
+#endif
     if (s->ss32) {
         if (addend == 2)
             gen_op_addl_ESP_2();
@@ -1615,9 +2022,118 @@ static inline void gen_stack_update(DisasContext *s, int addend)
     }
 }
 
+/* generate a push of T0. It depends on ss32, addseg and dflag (and on CODE64 for x86_64 targets) */
+static void gen_push_T0(DisasContext *s)
+{
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        gen_op_movq_A0_reg[R_ESP]();
+        if (s->dflag) {
+            gen_op_subq_A0_8(); /* dflag set: 8 byte slot, OT_QUAD store */
+            gen_op_st_T0_A0[OT_QUAD + s->mem_index]();
+        } else {
+            gen_op_subq_A0_2(); /* dflag clear: 2 byte slot, OT_WORD store */
+            gen_op_st_T0_A0[OT_WORD + s->mem_index]();
+        }
+        gen_op_movq_ESP_A0(); /* ESP is written only after the store has been emitted */
+    } else 
+#endif
+    {
+        gen_op_movl_A0_reg[R_ESP]();
+        if (!s->dflag)
+            gen_op_subl_A0_2();
+        else
+            gen_op_subl_A0_4();
+        if (s->ss32) {
+            if (s->addseg) {
+                gen_op_movl_T1_A0(); /* keep the new stack offset in T1 before the SS base is added */
+                gen_op_addl_A0_SS();
+            }
+        } else {
+            gen_op_andl_A0_ffff(); /* stack segment is not 32 bit: mask the offset to 16 bits */
+            gen_op_movl_T1_A0();
+            gen_op_addl_A0_SS();
+        }
+        gen_op_st_T0_A0[s->dflag + 1 + s->mem_index]();
+        if (s->ss32 && !s->addseg)
+            gen_op_movl_ESP_A0(); /* no base was added, so A0 still holds the plain offset */
+        else
+            gen_op_mov_reg_T1[s->ss32 + 1][R_ESP](); /* write back the offset saved in T1, sized by ss32 */
+    }
+}
+
+/* generate a push of T1. It depends on ss32, addseg and dflag (and on CODE64 for x86_64 targets) */
+/* slower version for T1, only used for call Ev: T1 holds the pushed value, so it cannot save the stack offset */
+static void gen_push_T1(DisasContext *s)
+{
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        gen_op_movq_A0_reg[R_ESP]();
+        if (s->dflag) {
+            gen_op_subq_A0_8();
+            gen_op_st_T1_A0[OT_QUAD + s->mem_index]();
+        } else {
+            gen_op_subq_A0_2();
+            gen_op_st_T1_A0[OT_WORD + s->mem_index](); /* the pushed value lives in T1, as in every other path */
+        }
+        gen_op_movq_ESP_A0(); /* ESP is written only after the store has been emitted */
+    } else 
+#endif
+    {
+        gen_op_movl_A0_reg[R_ESP]();
+        if (!s->dflag)
+            gen_op_subl_A0_2();
+        else
+            gen_op_subl_A0_4();
+        if (s->ss32) {
+            if (s->addseg) {
+                gen_op_addl_A0_SS();
+            }
+        } else {
+            gen_op_andl_A0_ffff(); /* stack segment is not 32 bit: mask the offset to 16 bits */
+            gen_op_addl_A0_SS();
+        }
+        gen_op_st_T1_A0[s->dflag + 1 + s->mem_index]();
+        
+        if (s->ss32 && !s->addseg)
+            gen_op_movl_ESP_A0(); /* no base was added, so A0 still holds the plain offset */
+        else
+            gen_stack_update(s, -(2 << s->dflag)); /* -2 or -4; avoids left-shifting a negative value */
+    }
+}
+
+/* two step pop is necessary for precise exceptions: this step only loads T0, ESP is adjusted by gen_pop_update() */
+static void gen_pop_T0(DisasContext *s)
+{
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        gen_op_movq_A0_reg[R_ESP]();
+        gen_op_ld_T0_A0[(s->dflag ? OT_QUAD : OT_WORD) + s->mem_index](); /* 64 bit code: quad load if dflag is set, else word */
+    } else 
+#endif
+    {
+        gen_op_movl_A0_reg[R_ESP]();
+        if (s->ss32) {
+            if (s->addseg)
+                gen_op_addl_A0_SS(); /* SS base is only added when addseg is set */
+        } else {
+            gen_op_andl_A0_ffff(); /* stack segment is not 32 bit: mask the offset to 16 bits */
+            gen_op_addl_A0_SS();
+        }
+        gen_op_ld_T0_A0[s->dflag + 1 + s->mem_index](); /* dflag + 1 selects the word or long load */
+    }
+}
+
 static void gen_pop_update(DisasContext *s)
 {
-    gen_stack_update(s, 2 << s->dflag);
+#ifdef TARGET_X86_64
+    if (CODE64(s) && s->dflag) {
+        gen_stack_update(s, 8);
+    } else
+#endif
+    {
+        gen_stack_update(s, 2 << s->dflag);
+    }
 }
 
 static void gen_stack_A0(DisasContext *s)
@@ -1646,7 +2162,7 @@ static void gen_pusha(DisasContext *s)
         gen_op_st_T0_A0[OT_WORD + s->dflag + s->mem_index]();
         gen_op_addl_A0_im(2 <<  s->dflag);
     }
-    gen_op_mov_reg_T1[OT_WORD + s->dflag][R_ESP]();
+    gen_op_mov_reg_T1[OT_WORD + s->ss32][R_ESP]();
 }
 
 /* NOTE: wrap around in 16 bit not fully handled */
@@ -1668,54 +2184,62 @@ static void gen_popa(DisasContext *s)
         }
         gen_op_addl_A0_im(2 <<  s->dflag);
     }
-    gen_op_mov_reg_T1[OT_WORD + s->dflag][R_ESP]();
+    gen_op_mov_reg_T1[OT_WORD + s->ss32][R_ESP]();
 }
 
-/* NOTE: wrap around in 16 bit not fully handled */
-/* XXX: check this */
 static void gen_enter(DisasContext *s, int esp_addend, int level)
 {
-    int ot, level1, addend, opsize;
+    int ot, opsize;
 
-    ot = s->dflag + OT_WORD;
     level &= 0x1f;
-    level1 = level;
-    opsize = 2 << s->dflag;
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        ot = s->dflag ? OT_QUAD : OT_WORD;
+        opsize = 1 << ot;
+        
+        gen_op_movl_A0_ESP();
+        gen_op_addq_A0_im(-opsize);
+        gen_op_movl_T1_A0();
 
-    gen_op_movl_A0_ESP();
-    gen_op_addl_A0_im(-opsize);
-    if (!s->ss32)
-        gen_op_andl_A0_ffff();
-    gen_op_movl_T1_A0();
-    if (s->addseg)
-        gen_op_addl_A0_seg(offsetof(CPUX86State,segs[R_SS].base));
-    /* push bp */
-    gen_op_mov_TN_reg[OT_LONG][0][R_EBP]();
-    gen_op_st_T0_A0[ot + s->mem_index]();
-    if (level) {
-        while (level--) {
-            gen_op_addl_A0_im(-opsize);
-            gen_op_addl_T0_im(-opsize);
-            gen_op_st_T0_A0[ot + s->mem_index]();
+        /* push bp */
+        gen_op_mov_TN_reg[OT_LONG][0][R_EBP]();
+        gen_op_st_T0_A0[ot + s->mem_index]();
+        if (level) {
+            gen_op_enter64_level(level, (ot == OT_QUAD));
         }
+        gen_op_mov_reg_T1[ot][R_EBP]();
+        gen_op_addl_T1_im( -esp_addend + (-opsize * level) );
+        gen_op_mov_reg_T1[OT_QUAD][R_ESP]();
+    } else 
+#endif
+    {
+        ot = s->dflag + OT_WORD;
+        opsize = 2 << s->dflag;
+        
+        gen_op_movl_A0_ESP();
         gen_op_addl_A0_im(-opsize);
-        /* XXX: add st_T1_A0 ? */
-        gen_op_movl_T0_T1();
+        if (!s->ss32)
+            gen_op_andl_A0_ffff();
+        gen_op_movl_T1_A0();
+        if (s->addseg)
+            gen_op_addl_A0_seg(offsetof(CPUX86State,segs[R_SS].base));
+        /* push bp */
+        gen_op_mov_TN_reg[OT_LONG][0][R_EBP]();
         gen_op_st_T0_A0[ot + s->mem_index]();
+        if (level) {
+            gen_op_enter_level(level, s->dflag);
+        }
+        gen_op_mov_reg_T1[ot][R_EBP]();
+        gen_op_addl_T1_im( -esp_addend + (-opsize * level) );
+        gen_op_mov_reg_T1[OT_WORD + s->ss32][R_ESP]();
     }
-    gen_op_mov_reg_T1[ot][R_EBP]();
-    addend = -esp_addend;
-    if (level1)
-        addend -= opsize * (level1 + 1);
-    gen_op_addl_T1_im(addend);
-    gen_op_mov_reg_T1[ot][R_ESP]();
 }
 
-static void gen_exception(DisasContext *s, int trapno, unsigned int cur_eip)
+static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip)
 {
     if (s->cc_op != CC_OP_DYNAMIC)
         gen_op_set_cc_op(s->cc_op);
-    gen_op_jmp_im(cur_eip);
+    gen_jmp_im(cur_eip);
     gen_op_raise_exception(trapno);
     s->is_jmp = 3;
 }
@@ -1723,20 +2247,20 @@ static void gen_exception(DisasContext *s, int trapno, unsigned int cur_eip)
 /* an interrupt is different from an exception because of the
    priviledge checks */
 static void gen_interrupt(DisasContext *s, int intno, 
-                          unsigned int cur_eip, unsigned int next_eip)
+                          target_ulong cur_eip, target_ulong next_eip)
 {
     if (s->cc_op != CC_OP_DYNAMIC)
         gen_op_set_cc_op(s->cc_op);
-    gen_op_jmp_im(cur_eip);
-    gen_op_raise_interrupt(intno, next_eip);
+    gen_jmp_im(cur_eip);
+    gen_op_raise_interrupt(intno, (int)(next_eip - cur_eip)); /* the op now receives next_eip as an int offset from cur_eip, not as an absolute value */
     s->is_jmp = 3;
 }
 
-static void gen_debug(DisasContext *s, unsigned int cur_eip)
+static void gen_debug(DisasContext *s, target_ulong cur_eip)
 {
     if (s->cc_op != CC_OP_DYNAMIC)
         gen_op_set_cc_op(s->cc_op);
-    gen_op_jmp_im(cur_eip);
+    gen_jmp_im(cur_eip); /* cur_eip is a target_ulong here; gen_jmp_im is defined outside this chunk */
     gen_op_debug();
     s->is_jmp = 3;
 }
@@ -1747,6 +2271,9 @@ static void gen_eob(DisasContext *s)
 {
     if (s->cc_op != CC_OP_DYNAMIC)
         gen_op_set_cc_op(s->cc_op);
+    if (s->tb->flags & HF_INHIBIT_IRQ_MASK) {
+        gen_op_reset_inhibit_irq();
+    }
     if (s->singlestep_enabled) {
         gen_op_debug();
     } else if (s->tf) {
@@ -1760,80 +2287,956 @@ static void gen_eob(DisasContext *s)
 
 /* generate a jump to eip. No segment change must happen before as a
    direct call to the next block may occur */
-static void gen_jmp(DisasContext *s, unsigned int eip)
+static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
 {
-    TranslationBlock *tb = s->tb;
-
     if (s->jmp_opt) {
-        if (s->cc_op != CC_OP_DYNAMIC)
+        if (s->cc_op != CC_OP_DYNAMIC) {
             gen_op_set_cc_op(s->cc_op);
-        gen_op_jmp((long)tb, eip);
+            s->cc_op = CC_OP_DYNAMIC; /* cc_op was just emitted via gen_op_set_cc_op; track it as dynamic from here on */
+        }
+        gen_goto_tb(s, tb_num, eip); /* tb_num is forwarded unchanged; gen_jmp() passes 0 */
         s->is_jmp = 3;
     } else {
-        gen_op_jmp_im(eip);
-        gen_eob(s);
+        gen_jmp_im(eip); /* jmp_opt is off: set eip, then end the block through gen_eob */
+        gen_eob(s);
+    }
+}
+
+static void gen_jmp(DisasContext *s, target_ulong eip)
+{
+    gen_jmp_tb(s, eip, 0); /* same as gen_jmp_tb with tb_num fixed to 0 */
+}
+
+static void gen_movtl_T0_im(target_ulong val)
+{
+#ifdef TARGET_X86_64    
+    if ((int32_t)val == val) { /* val equals its own int32_t truncation: the 32-bit immediate op is enough */
+        gen_op_movl_T0_im(val);
+    } else {
+        gen_op_movq_T0_im64(val >> 32, val); /* first argument is the upper 32 bits, second is val itself */
+    }
+#else
+    gen_op_movl_T0_im(val); /* non-x86_64 targets always use the 32-bit immediate op */
+#endif
+}
+
+static void gen_movtl_T1_im(target_ulong val)
+{
+#ifdef TARGET_X86_64    
+    if ((int32_t)val == val) { /* val equals its own int32_t truncation: the 32-bit immediate op is enough */
+        gen_op_movl_T1_im(val);
+    } else {
+        gen_op_movq_T1_im64(val >> 32, val); /* first argument is the upper 32 bits, second is val itself */
+    }
+#else
+    gen_op_movl_T1_im(val); /* non-x86_64 targets always use the 32-bit immediate op */
+#endif
+}
+
+static void gen_add_A0_im(DisasContext *s, int val)
+{
+#ifdef TARGET_X86_64
+    if (CODE64(s)) /* CODE64(s) expands to s->code64 in TARGET_X86_64 builds */
+        gen_op_addq_A0_im(val);
+    else
+#endif
+        gen_op_addl_A0_im(val); /* the only path on non-x86_64 targets */
+}
+
+static GenOpFunc1 *gen_ldq_env_A0[3] = { /* gen_sse indexes this with s->mem_index >> 2 */
+    gen_op_ldq_raw_env_A0,
+#ifndef CONFIG_USER_ONLY /* CONFIG_USER_ONLY builds initialize slot 0 only */
+    gen_op_ldq_kernel_env_A0,
+    gen_op_ldq_user_env_A0,
+#endif
+};
+
+static GenOpFunc1 *gen_stq_env_A0[3] = { /* gen_sse indexes this with s->mem_index >> 2 */
+    gen_op_stq_raw_env_A0,
+#ifndef CONFIG_USER_ONLY /* CONFIG_USER_ONLY builds initialize slot 0 only */
+    gen_op_stq_kernel_env_A0,
+    gen_op_stq_user_env_A0,
+#endif
+};
+
+static GenOpFunc1 *gen_ldo_env_A0[3] = { /* gen_sse indexes this with s->mem_index >> 2 */
+    gen_op_ldo_raw_env_A0,
+#ifndef CONFIG_USER_ONLY /* CONFIG_USER_ONLY builds initialize slot 0 only */
+    gen_op_ldo_kernel_env_A0,
+    gen_op_ldo_user_env_A0,
+#endif
+};
+
+static GenOpFunc1 *gen_sto_env_A0[3] = { /* gen_sse indexes this with s->mem_index >> 2 */
+    gen_op_sto_raw_env_A0,
+#ifndef CONFIG_USER_ONLY /* CONFIG_USER_ONLY builds initialize slot 0 only */
+    gen_op_sto_kernel_env_A0,
+    gen_op_sto_user_env_A0,
+#endif
+};
+
+#define SSE_SPECIAL ((GenOpFunc2 *)1) /* sentinel, never called: gen_sse compares against it and decodes the insn in its own switch */
+
+#define MMX_OP2(x) { gen_op_ ## x ## _mmx, gen_op_ ## x ## _xmm } /* row initializer: { MMX op, XMM op } */
+#define SSE_FOP(x) { gen_op_ ## x ## ps, gen_op_ ## x ## pd, \
+                     gen_op_ ## x ## ss, gen_op_ ## x ## sd, } /* row initializer: { ps, pd, ss, sd } */
+
+static GenOpFunc2 *sse_op_table1[256][4] = { /* [opcode byte][b1]; b1 as computed in gen_sse: 0 no prefix, 1 PREFIX_DATA, 2 PREFIX_REPZ, 3 PREFIX_REPNZ */
+    /* pure SSE operations */
+    [0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */
+    [0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */
+    [0x12] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd, movsldup, movddup */
+    [0x13] = { SSE_SPECIAL, SSE_SPECIAL },  /* movlps, movlpd */
+    [0x14] = { gen_op_punpckldq_xmm, gen_op_punpcklqdq_xmm },
+    [0x15] = { gen_op_punpckhdq_xmm, gen_op_punpckhqdq_xmm },
+    [0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },  /* movhps, movhpd, movshdup */
+    [0x17] = { SSE_SPECIAL, SSE_SPECIAL },  /* movhps, movhpd */
+
+    [0x28] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
+    [0x29] = { SSE_SPECIAL, SSE_SPECIAL },  /* movaps, movapd */
+    [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
+    [0x2b] = { SSE_SPECIAL, SSE_SPECIAL },  /* movntps, movntpd */
+    [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttss2si, cvttsd2si */
+    [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtss2si, cvtsd2si */
+    [0x2e] = { gen_op_ucomiss, gen_op_ucomisd },
+    [0x2f] = { gen_op_comiss, gen_op_comisd },
+    [0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */
+    [0x51] = SSE_FOP(sqrt),
+    [0x52] = { gen_op_rsqrtps, NULL, gen_op_rsqrtss, NULL },
+    [0x53] = { gen_op_rcpps, NULL, gen_op_rcpss, NULL },
+    [0x54] = { gen_op_pand_xmm, gen_op_pand_xmm }, /* andps, andpd */
+    [0x55] = { gen_op_pandn_xmm, gen_op_pandn_xmm }, /* andnps, andnpd */
+    [0x56] = { gen_op_por_xmm, gen_op_por_xmm }, /* orps, orpd */
+    [0x57] = { gen_op_pxor_xmm, gen_op_pxor_xmm }, /* xorps, xorpd */
+    [0x58] = SSE_FOP(add),
+    [0x59] = SSE_FOP(mul),
+    [0x5a] = { gen_op_cvtps2pd, gen_op_cvtpd2ps, 
+               gen_op_cvtss2sd, gen_op_cvtsd2ss },
+    [0x5b] = { gen_op_cvtdq2ps, gen_op_cvtps2dq, gen_op_cvttps2dq },
+    [0x5c] = SSE_FOP(sub),
+    [0x5d] = SSE_FOP(min),
+    [0x5e] = SSE_FOP(div),
+    [0x5f] = SSE_FOP(max),
+
+    [0xc2] = SSE_FOP(cmpeq), /* only marks the row as valid: gen_sse picks the real op from sse_op_table4 by the immediate byte */
+    [0xc6] = { (GenOpFunc2 *)gen_op_shufps, (GenOpFunc2 *)gen_op_shufpd }, /* stored as GenOpFunc2; gen_sse casts back to GenOpFunc3 */
+
+    /* MMX ops and their SSE extensions */
+    [0x60] = MMX_OP2(punpcklbw),
+    [0x61] = MMX_OP2(punpcklwd),
+    [0x62] = MMX_OP2(punpckldq),
+    [0x63] = MMX_OP2(packsswb),
+    [0x64] = MMX_OP2(pcmpgtb),
+    [0x65] = MMX_OP2(pcmpgtw),
+    [0x66] = MMX_OP2(pcmpgtl),
+    [0x67] = MMX_OP2(packuswb),
+    [0x68] = MMX_OP2(punpckhbw),
+    [0x69] = MMX_OP2(punpckhwd),
+    [0x6a] = MMX_OP2(punpckhdq),
+    [0x6b] = MMX_OP2(packssdw),
+    [0x6c] = { NULL, gen_op_punpcklqdq_xmm },
+    [0x6d] = { NULL, gen_op_punpckhqdq_xmm },
+    [0x6e] = { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea; movd xmm, ea */
+    [0x6f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, movdqu */
+    [0x70] = { (GenOpFunc2 *)gen_op_pshufw_mmx, 
+               (GenOpFunc2 *)gen_op_pshufd_xmm, 
+               (GenOpFunc2 *)gen_op_pshufhw_xmm, 
+               (GenOpFunc2 *)gen_op_pshuflw_xmm }, /* stored as GenOpFunc2; gen_sse casts back to GenOpFunc3 */
+    [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */
+    [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */
+    [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */
+    [0x74] = MMX_OP2(pcmpeqb),
+    [0x75] = MMX_OP2(pcmpeqw),
+    [0x76] = MMX_OP2(pcmpeql),
+    [0x77] = { SSE_SPECIAL }, /* emms */
+    [0x7c] = { NULL, gen_op_haddpd, NULL, gen_op_haddps },
+    [0x7d] = { NULL, gen_op_hsubpd, NULL, gen_op_hsubps },
+    [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, movq */
+    [0x7f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, movdqu */
+    [0xc4] = { SSE_SPECIAL, SSE_SPECIAL }, /* pinsrw */
+    [0xc5] = { SSE_SPECIAL, SSE_SPECIAL }, /* pextrw */
+    [0xd0] = { NULL, gen_op_addsubpd, NULL, gen_op_addsubps },
+    [0xd1] = MMX_OP2(psrlw),
+    [0xd2] = MMX_OP2(psrld),
+    [0xd3] = MMX_OP2(psrlq),
+    [0xd4] = MMX_OP2(paddq),
+    [0xd5] = MMX_OP2(pmullw),
+    [0xd6] = { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* -, movq ea/xmm, movq2dq, movdq2q */
+    [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */
+    [0xd8] = MMX_OP2(psubusb),
+    [0xd9] = MMX_OP2(psubusw),
+    [0xda] = MMX_OP2(pminub),
+    [0xdb] = MMX_OP2(pand),
+    [0xdc] = MMX_OP2(paddusb),
+    [0xdd] = MMX_OP2(paddusw),
+    [0xde] = MMX_OP2(pmaxub),
+    [0xdf] = MMX_OP2(pandn),
+    [0xe0] = MMX_OP2(pavgb),
+    [0xe1] = MMX_OP2(psraw),
+    [0xe2] = MMX_OP2(psrad),
+    [0xe3] = MMX_OP2(pavgw),
+    [0xe4] = MMX_OP2(pmulhuw),
+    [0xe5] = MMX_OP2(pmulhw),
+    [0xe6] = { NULL, gen_op_cvttpd2dq, gen_op_cvtdq2pd, gen_op_cvtpd2dq },
+    [0xe7] = { SSE_SPECIAL , SSE_SPECIAL },  /* movntq, movntdq */
+    [0xe8] = MMX_OP2(psubsb),
+    [0xe9] = MMX_OP2(psubsw),
+    [0xea] = MMX_OP2(pminsw),
+    [0xeb] = MMX_OP2(por),
+    [0xec] = MMX_OP2(paddsb),
+    [0xed] = MMX_OP2(paddsw),
+    [0xee] = MMX_OP2(pmaxsw),
+    [0xef] = MMX_OP2(pxor),
+    [0xf0] = { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu */
+    [0xf1] = MMX_OP2(psllw),
+    [0xf2] = MMX_OP2(pslld),
+    [0xf3] = MMX_OP2(psllq),
+    [0xf4] = MMX_OP2(pmuludq),
+    [0xf5] = MMX_OP2(pmaddwd),
+    [0xf6] = MMX_OP2(psadbw),
+    [0xf7] = MMX_OP2(maskmov),
+    [0xf8] = MMX_OP2(psubb),
+    [0xf9] = MMX_OP2(psubw),
+    [0xfa] = MMX_OP2(psubl),
+    [0xfb] = MMX_OP2(psubq),
+    [0xfc] = MMX_OP2(paddb),
+    [0xfd] = MMX_OP2(paddw),
+    [0xfe] = MMX_OP2(paddl),
+};
+
+static GenOpFunc2 *sse_op_table2[3 * 8][2] = { /* gen_sse index: ((b - 1) & 3) * 8 + ((modrm >> 3) & 7), then [b1]; unset slots stay NULL and are rejected */
+    [0 + 2] = MMX_OP2(psrlw),
+    [0 + 4] = MMX_OP2(psraw),
+    [0 + 6] = MMX_OP2(psllw),
+    [8 + 2] = MMX_OP2(psrld),
+    [8 + 4] = MMX_OP2(psrad),
+    [8 + 6] = MMX_OP2(pslld),
+    [16 + 2] = MMX_OP2(psrlq),
+    [16 + 3] = { NULL, gen_op_psrldq_xmm }, /* XMM column only */
+    [16 + 6] = MMX_OP2(psllq),
+    [16 + 7] = { NULL, gen_op_pslldq_xmm }, /* XMM column only */
+};
+
+static GenOpFunc1 *sse_op_table3[4 * 3] = { /* three groups of four; gen_sse adds 2 within a group when s->dflag == 2 */
+    gen_op_cvtsi2ss,
+    gen_op_cvtsi2sd,
+    X86_64_ONLY(gen_op_cvtsq2ss), /* X86_64_ONLY(x) expands to NULL on non-x86_64 targets */
+    X86_64_ONLY(gen_op_cvtsq2sd),
+    
+    gen_op_cvttss2si,
+    gen_op_cvttsd2si,
+    X86_64_ONLY(gen_op_cvttss2sq),
+    X86_64_ONLY(gen_op_cvttsd2sq),
+
+    gen_op_cvtss2si,
+    gen_op_cvtsd2si,
+    X86_64_ONLY(gen_op_cvtss2sq),
+    X86_64_ONLY(gen_op_cvtsd2sq),
+};
+    
+static GenOpFunc2 *sse_op_table4[8][4] = { /* [immediate byte (0..7)][b1], used by the 0xc2 compare path in gen_sse */
+    SSE_FOP(cmpeq),
+    SSE_FOP(cmplt),
+    SSE_FOP(cmple),
+    SSE_FOP(cmpunord),
+    SSE_FOP(cmpneq),
+    SSE_FOP(cmpnlt),
+    SSE_FOP(cmpnle),
+    SSE_FOP(cmpord),
+};
+    
+static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
+{
+    int b1, op1_offset, op2_offset, is_xmm, val, ot;
+    int modrm, mod, rm, reg, reg_addr, offset_addr;
+    GenOpFunc2 *sse_op2;
+    GenOpFunc3 *sse_op3;
+
+    b &= 0xff;
+    if (s->prefix & PREFIX_DATA) 
+        b1 = 1;
+    else if (s->prefix & PREFIX_REPZ) 
+        b1 = 2;
+    else if (s->prefix & PREFIX_REPNZ) 
+        b1 = 3;
+    else
+        b1 = 0;
+    sse_op2 = sse_op_table1[b][b1];
+    if (!sse_op2) 
+        goto illegal_op;
+    if (b <= 0x5f || b == 0xc6 || b == 0xc2) {
+        is_xmm = 1;
+    } else {
+        if (b1 == 0) {
+            /* MMX case */
+            is_xmm = 0;
+        } else {
+            is_xmm = 1;
+        }
+    }
+    /* simple MMX/SSE operation */
+    if (s->flags & HF_TS_MASK) {
+        gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+        return;
+    }
+    if (s->flags & HF_EM_MASK) {
+    illegal_op:
+        gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base);
+        return;
+    }
+    if (is_xmm && !(s->flags & HF_OSFXSR_MASK))
+        goto illegal_op;
+    if (b == 0x77) {
+        /* emms */
+        gen_op_emms();
+        return;
+    }
+    /* prepare MMX state (XXX: optimize by storing fptt and fptags in
+       the static cpu state) */
+    if (!is_xmm) {
+        gen_op_enter_mmx();
+    }
+
+    modrm = ldub_code(s->pc++);
+    reg = ((modrm >> 3) & 7);
+    if (is_xmm)
+        reg |= rex_r;
+    mod = (modrm >> 6) & 3;
+    if (sse_op2 == SSE_SPECIAL) {
+        b |= (b1 << 8);
+        switch(b) {
+        case 0x0e7: /* movntq */
+            if (mod == 3) 
+                goto illegal_op;
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
+            break;
+        case 0x1e7: /* movntdq */
+        case 0x02b: /* movntps */
+        case 0x12b: /* movntps */
+        case 0x3f0: /* lddqu */
+            if (mod == 3)
+                goto illegal_op;
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
+            break;
+        case 0x6e: /* movd mm, ea */
+            gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0);
+            gen_op_movl_mm_T0_mmx(offsetof(CPUX86State,fpregs[reg].mmx));
+            break;
+        case 0x16e: /* movd xmm, ea */
+            gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0);
+            gen_op_movl_mm_T0_xmm(offsetof(CPUX86State,xmm_regs[reg]));
+            break;
+        case 0x6f: /* movq mm, ea */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
+            } else {
+                rm = (modrm & 7);
+                gen_op_movq(offsetof(CPUX86State,fpregs[reg].mmx),
+                            offsetof(CPUX86State,fpregs[rm].mmx));
+            }
+            break;
+        case 0x010: /* movups */
+        case 0x110: /* movupd */
+        case 0x028: /* movaps */
+        case 0x128: /* movapd */
+        case 0x16f: /* movdqa xmm, ea */
+        case 0x26f: /* movdqu xmm, ea */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movo(offsetof(CPUX86State,xmm_regs[reg]),
+                            offsetof(CPUX86State,xmm_regs[rm]));
+            }
+            break;
+        case 0x210: /* movss xmm, ea */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
+                gen_op_movl_T0_0();
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)));
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)));
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)));
+            }
+            break;
+        case 0x310: /* movsd xmm, ea */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+                gen_op_movl_T0_0();
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)));
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
+            }
+            break;
+        case 0x012: /* movlps */
+        case 0x112: /* movlpd */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                /* movhlps */
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1)));
+            }
+            break;
+        case 0x212: /* movsldup */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)));
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_L(2)));
+            }
+            gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)),
+                        offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
+            gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)),
+                        offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)));
+            break;
+        case 0x312: /* movddup */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
+            }
+            gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)),
+                        offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            break;
+        case 0x016: /* movhps */
+        case 0x116: /* movhpd */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
+            } else {
+                /* movlhps */
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
+            }
+            break;
+        case 0x216: /* movshdup */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldo_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_L(1)));
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_L(3)));
+            }
+            gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)),
+                        offsetof(CPUX86State,xmm_regs[reg].XMM_L(1)));
+            gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
+                        offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
+            break;
+        case 0x7e: /* movd ea, mm */
+            gen_op_movl_T0_mm_mmx(offsetof(CPUX86State,fpregs[reg].mmx));
+            gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1);
+            break;
+        case 0x17e: /* movd ea, xmm */
+            gen_op_movl_T0_mm_xmm(offsetof(CPUX86State,xmm_regs[reg]));
+            gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1);
+            break;
+        case 0x27e: /* movq xmm, ea */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
+            }
+            gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
+            break;
+        case 0x7f: /* movq ea, mm */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,fpregs[reg].mmx));
+            } else {
+                rm = (modrm & 7);
+                gen_op_movq(offsetof(CPUX86State,fpregs[rm].mmx),
+                            offsetof(CPUX86State,fpregs[reg].mmx));
+            }
+            break;
+        case 0x011: /* movups */
+        case 0x111: /* movupd */
+        case 0x029: /* movaps */
+        case 0x129: /* movapd */
+        case 0x17f: /* movdqa ea, xmm */
+        case 0x27f: /* movdqu ea, xmm */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_sto_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg]));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movo(offsetof(CPUX86State,xmm_regs[rm]),
+                            offsetof(CPUX86State,xmm_regs[reg]));
+            }
+            break;
+        case 0x211: /* movss ea, xmm */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_op_movl_T0_env(offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
+                gen_op_st_T0_A0[OT_LONG + s->mem_index]();
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movl(offsetof(CPUX86State,xmm_regs[rm].XMM_L(0)),
+                            offsetof(CPUX86State,xmm_regs[reg].XMM_L(0)));
+            }
+            break;
+        case 0x311: /* movsd ea, xmm */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            }
+            break;
+        case 0x013: /* movlps */
+        case 0x113: /* movlpd */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                goto illegal_op;
+            }
+            break;
+        case 0x017: /* movhps */
+        case 0x117: /* movhpd */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
+            } else {
+                goto illegal_op;
+            }
+            break;
+        case 0x71: /* shift mm, im */
+        case 0x72:
+        case 0x73:
+        case 0x171: /* shift xmm, im */
+        case 0x172:
+        case 0x173:
+            val = ldub_code(s->pc++);
+            if (is_xmm) {
+                gen_op_movl_T0_im(val);
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
+                gen_op_movl_T0_0();
+                gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(1)));
+                op1_offset = offsetof(CPUX86State,xmm_t0);
+            } else {
+                gen_op_movl_T0_im(val);
+                gen_op_movl_env_T0(offsetof(CPUX86State,mmx_t0.MMX_L(0)));
+                gen_op_movl_T0_0();
+                gen_op_movl_env_T0(offsetof(CPUX86State,mmx_t0.MMX_L(1)));
+                op1_offset = offsetof(CPUX86State,mmx_t0);
+            }
+            sse_op2 = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1];
+            if (!sse_op2)
+                goto illegal_op;
+            if (is_xmm) {
+                rm = (modrm & 7) | REX_B(s);
+                op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
+            } else {
+                rm = (modrm & 7);
+                op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+            }
+            sse_op2(op2_offset, op1_offset);
+            break;
+        case 0x050: /* movmskps */
+            rm = (modrm & 7) | REX_B(s);
+            gen_op_movmskps(offsetof(CPUX86State,xmm_regs[rm]));
+            gen_op_mov_reg_T0[OT_LONG][reg]();
+            break;
+        case 0x150: /* movmskpd */
+            rm = (modrm & 7) | REX_B(s);
+            gen_op_movmskpd(offsetof(CPUX86State,xmm_regs[rm]));
+            gen_op_mov_reg_T0[OT_LONG][reg]();
+            break;
+        case 0x02a: /* cvtpi2ps */
+        case 0x12a: /* cvtpi2pd */
+            gen_op_enter_mmx();
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                op2_offset = offsetof(CPUX86State,mmx_t0);
+                gen_ldq_env_A0[s->mem_index >> 2](op2_offset);
+            } else {
+                rm = (modrm & 7);
+                op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+            }
+            op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
+            switch(b >> 8) {
+            case 0x0:
+                gen_op_cvtpi2ps(op1_offset, op2_offset);
+                break;
+            default:
+            case 0x1:
+                gen_op_cvtpi2pd(op1_offset, op2_offset);
+                break;
+            }
+            break;
+        case 0x22a: /* cvtsi2ss */
+        case 0x32a: /* cvtsi2sd */
+            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+            gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
+            op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
+            sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2)](op1_offset);
+            break;
+        case 0x02c: /* cvttps2pi */
+        case 0x12c: /* cvttpd2pi */
+        case 0x02d: /* cvtps2pi */
+        case 0x12d: /* cvtpd2pi */
+            gen_op_enter_mmx();
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                op2_offset = offsetof(CPUX86State,xmm_t0);
+                gen_ldo_env_A0[s->mem_index >> 2](op2_offset);
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
+            }
+            op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx);
+            switch(b) {
+            case 0x02c:
+                gen_op_cvttps2pi(op1_offset, op2_offset);
+                break;
+            case 0x12c:
+                gen_op_cvttpd2pi(op1_offset, op2_offset);
+                break;
+            case 0x02d:
+                gen_op_cvtps2pi(op1_offset, op2_offset);
+                break;
+            case 0x12d:
+                gen_op_cvtpd2pi(op1_offset, op2_offset);
+                break;
+            }
+            break;
+        case 0x22c: /* cvttss2si */
+        case 0x32c: /* cvttsd2si */
+        case 0x22d: /* cvtss2si */
+        case 0x32d: /* cvtsd2si */
+            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                if ((b >> 8) & 1) {
+                    gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_t0.XMM_Q(0)));
+                } else {
+                    gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                    gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
+                }
+                op2_offset = offsetof(CPUX86State,xmm_t0);
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
+            }
+            sse_op_table3[(s->dflag == 2) * 2 + ((b >> 8) - 2) + 4 + 
+                          (b & 1) * 4](op2_offset);
+            gen_op_mov_reg_T0[ot][reg]();
+            break;
+        case 0xc4: /* pinsrw */
+        case 0x1c4: 
+            s->rip_offset = 1;
+            gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
+            val = ldub_code(s->pc++);
+            if (b1) {
+                val &= 7;
+                gen_op_pinsrw_xmm(offsetof(CPUX86State,xmm_regs[reg]), val);
+            } else {
+                val &= 3;
+                gen_op_pinsrw_mmx(offsetof(CPUX86State,fpregs[reg].mmx), val);
+            }
+            break;
+        case 0xc5: /* pextrw */
+        case 0x1c5: 
+            if (mod != 3)
+                goto illegal_op;
+            val = ldub_code(s->pc++);
+            if (b1) {
+                val &= 7;
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_pextrw_xmm(offsetof(CPUX86State,xmm_regs[rm]), val);
+            } else {
+                val &= 3;
+                rm = (modrm & 7);
+                gen_op_pextrw_mmx(offsetof(CPUX86State,fpregs[rm].mmx), val);
+            }
+            reg = ((modrm >> 3) & 7) | rex_r;
+            gen_op_mov_reg_T0[OT_LONG][reg]();
+            break;
+        case 0x1d6: /* movq ea, xmm */
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_stq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_movq(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)),
+                            offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)));
+                gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[rm].XMM_Q(1)));
+            }
+            break;
+        case 0x2d6: /* movq2dq */
+            gen_op_enter_mmx();
+            rm = (modrm & 7);
+            gen_op_movq(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0)),
+                        offsetof(CPUX86State,fpregs[rm].mmx));
+            gen_op_movq_env_0(offsetof(CPUX86State,xmm_regs[reg].XMM_Q(1)));
+            break;
+        case 0x3d6: /* movdq2q */
+            gen_op_enter_mmx();
+            rm = (modrm & 7) | REX_B(s);
+            gen_op_movq(offsetof(CPUX86State,fpregs[reg & 7].mmx),
+                        offsetof(CPUX86State,xmm_regs[rm].XMM_Q(0)));
+            break;
+        case 0xd7: /* pmovmskb */
+        case 0x1d7:
+            if (mod != 3)
+                goto illegal_op;
+            if (b1) {
+                rm = (modrm & 7) | REX_B(s);
+                gen_op_pmovmskb_xmm(offsetof(CPUX86State,xmm_regs[rm]));
+            } else {
+                rm = (modrm & 7);
+                gen_op_pmovmskb_mmx(offsetof(CPUX86State,fpregs[rm].mmx));
+            }
+            reg = ((modrm >> 3) & 7) | rex_r;
+            gen_op_mov_reg_T0[OT_LONG][reg]();
+            break;
+        default:
+            goto illegal_op;
+        }
+    } else {
+        /* generic MMX or SSE operation */
+        switch(b) {
+        case 0xf7:
+            /* maskmov : we must prepare A0 */
+            if (mod != 3) 
+                goto illegal_op;
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                gen_op_movq_A0_reg[R_EDI]();
+            } else 
+#endif
+            {
+                gen_op_movl_A0_reg[R_EDI]();
+                if (s->aflag == 0)
+                    gen_op_andl_A0_ffff();
+            }
+            gen_add_A0_ds_seg(s);
+            break;
+        case 0x70: /* pshufx insn */
+        case 0xc6: /* pshufx insn */
+        case 0xc2: /* compare insns */
+            s->rip_offset = 1;
+            break;
+        default:
+            break;
+        }
+        if (is_xmm) {
+            op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                op2_offset = offsetof(CPUX86State,xmm_t0);
+                if (b1 >= 2 && ((b >= 0x50 && b <= 0x5f && b != 0x5b) ||
+                                b == 0xc2)) {
+                    /* specific case for SSE single instructions */
+                    if (b1 == 2) {
+                        /* 32 bit access */
+                        gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                        gen_op_movl_env_T0(offsetof(CPUX86State,xmm_t0.XMM_L(0)));
+                    } else {
+                        /* 64 bit access */
+                        gen_ldq_env_A0[s->mem_index >> 2](offsetof(CPUX86State,xmm_t0.XMM_D(0)));
+                    }
+                } else {
+                    gen_ldo_env_A0[s->mem_index >> 2](op2_offset);
+                }
+            } else {
+                rm = (modrm & 7) | REX_B(s);
+                op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
+            }
+        } else {
+            op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                op2_offset = offsetof(CPUX86State,mmx_t0);
+                gen_ldq_env_A0[s->mem_index >> 2](op2_offset);
+            } else {
+                rm = (modrm & 7);
+                op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+            }
+        }
+        switch(b) {
+        case 0x70: /* pshufx insn */
+        case 0xc6: /* pshufx insn */
+            val = ldub_code(s->pc++);
+            sse_op3 = (GenOpFunc3 *)sse_op2;
+            sse_op3(op1_offset, op2_offset, val);
+            break;
+        case 0xc2:
+            /* compare insns */
+            val = ldub_code(s->pc++);
+            if (val >= 8)
+                goto illegal_op;
+            sse_op2 = sse_op_table4[val][b1];
+            sse_op2(op1_offset, op2_offset);
+            break;
+        default:
+            sse_op2(op1_offset, op2_offset);
+            break;
+        }
+        if (b == 0x2e || b == 0x2f) {
+            s->cc_op = CC_OP_EFLAGS;
+        }
     }
 }
 
+
 /* convert one instruction. s->is_jmp is set if the translation must
    be stopped. Return the next pc value */
-static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
+static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
 {
     int b, prefixes, aflag, dflag;
     int shift, ot;
     int modrm, reg, rm, mod, reg_addr, op, opreg, offset_addr, val;
-    unsigned int next_eip;
+    target_ulong next_eip, tval;
+    int rex_w, rex_r;
 
     s->pc = pc_start;
     prefixes = 0;
     aflag = s->code32;
     dflag = s->code32;
     s->override = -1;
+    rex_w = -1;
+    rex_r = 0;
+#ifdef TARGET_X86_64
+    s->rex_x = 0;
+    s->rex_b = 0;
+    x86_64_hregs = 0; 
+#endif
+    s->rip_offset = 0; /* for relative ip address */
  next_byte:
     b = ldub_code(s->pc);
     s->pc++;
     /* check prefixes */
-    switch (b) {
-    case 0xf3:
-        prefixes |= PREFIX_REPZ;
-        goto next_byte;
-    case 0xf2:
-        prefixes |= PREFIX_REPNZ;
-        goto next_byte;
-    case 0xf0:
-        prefixes |= PREFIX_LOCK;
-        goto next_byte;
-    case 0x2e:
-        s->override = R_CS;
-        goto next_byte;
-    case 0x36:
-        s->override = R_SS;
-        goto next_byte;
-    case 0x3e:
-        s->override = R_DS;
-        goto next_byte;
-    case 0x26:
-        s->override = R_ES;
-        goto next_byte;
-    case 0x64:
-        s->override = R_FS;
-        goto next_byte;
-    case 0x65:
-        s->override = R_GS;
-        goto next_byte;
-    case 0x66:
-        prefixes |= PREFIX_DATA;
-        goto next_byte;
-    case 0x67:
-        prefixes |= PREFIX_ADR;
-        goto next_byte;
+#ifdef TARGET_X86_64
+    if (CODE64(s)) {
+        switch (b) {
+        case 0xf3:
+            prefixes |= PREFIX_REPZ;
+            goto next_byte;
+        case 0xf2:
+            prefixes |= PREFIX_REPNZ;
+            goto next_byte;
+        case 0xf0:
+            prefixes |= PREFIX_LOCK;
+            goto next_byte;
+        case 0x2e:
+            s->override = R_CS;
+            goto next_byte;
+        case 0x36:
+            s->override = R_SS;
+            goto next_byte;
+        case 0x3e:
+            s->override = R_DS;
+            goto next_byte;
+        case 0x26:
+            s->override = R_ES;
+            goto next_byte;
+        case 0x64:
+            s->override = R_FS;
+            goto next_byte;
+        case 0x65:
+            s->override = R_GS;
+            goto next_byte;
+        case 0x66:
+            prefixes |= PREFIX_DATA;
+            goto next_byte;
+        case 0x67:
+            prefixes |= PREFIX_ADR;
+            goto next_byte;
+        case 0x40 ... 0x4f:
+            /* REX prefix */
+            rex_w = (b >> 3) & 1;
+            rex_r = (b & 0x4) << 1;
+            s->rex_x = (b & 0x2) << 2;
+            REX_B(s) = (b & 0x1) << 3;
+            x86_64_hregs = 1; /* select uniform byte register addressing */
+            goto next_byte;
+        }
+        if (rex_w == 1) {
+            /* 0x66 is ignored if rex.w is set */
+            dflag = 2;
+        } else {
+            if (prefixes & PREFIX_DATA)
+                dflag ^= 1;
+        }
+        if (!(prefixes & PREFIX_ADR))
+            aflag = 2;
+    } else 
+#endif
+    {
+        switch (b) {
+        case 0xf3:
+            prefixes |= PREFIX_REPZ;
+            goto next_byte;
+        case 0xf2:
+            prefixes |= PREFIX_REPNZ;
+            goto next_byte;
+        case 0xf0:
+            prefixes |= PREFIX_LOCK;
+            goto next_byte;
+        case 0x2e:
+            s->override = R_CS;
+            goto next_byte;
+        case 0x36:
+            s->override = R_SS;
+            goto next_byte;
+        case 0x3e:
+            s->override = R_DS;
+            goto next_byte;
+        case 0x26:
+            s->override = R_ES;
+            goto next_byte;
+        case 0x64:
+            s->override = R_FS;
+            goto next_byte;
+        case 0x65:
+            s->override = R_GS;
+            goto next_byte;
+        case 0x66:
+            prefixes |= PREFIX_DATA;
+            goto next_byte;
+        case 0x67:
+            prefixes |= PREFIX_ADR;
+            goto next_byte;
+        }
+        if (prefixes & PREFIX_DATA)
+            dflag ^= 1;
+        if (prefixes & PREFIX_ADR)
+            aflag ^= 1;
     }
 
-    if (prefixes & PREFIX_DATA)
-        dflag ^= 1;
-    if (prefixes & PREFIX_ADR)
-        aflag ^= 1;
-
     s->prefix = prefixes;
     s->aflag = aflag;
     s->dflag = dflag;
@@ -1869,14 +3272,14 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             if ((b & 1) == 0)
                 ot = OT_BYTE;
             else
-                ot = dflag ? OT_LONG : OT_WORD;
+                ot = dflag + OT_WORD;
             
             switch(f) {
             case 0: /* OP Ev, Gv */
                 modrm = ldub_code(s->pc++);
-                reg = ((modrm >> 3) & 7);
+                reg = ((modrm >> 3) & 7) | rex_r;
                 mod = (modrm >> 6) & 3;
-                rm = modrm & 7;
+                rm = (modrm & 7) | REX_B(s);
                 if (mod != 3) {
                     gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                     opreg = OR_TMP0;
@@ -1897,8 +3300,8 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             case 1: /* OP Gv, Ev */
                 modrm = ldub_code(s->pc++);
                 mod = (modrm >> 6) & 3;
-                reg = ((modrm >> 3) & 7);
-                rm = modrm & 7;
+                reg = ((modrm >> 3) & 7) | rex_r;
+                rm = (modrm & 7) | REX_B(s);
                 if (mod != 3) {
                     gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                     gen_op_ld_T1_A0[ot + s->mem_index]();
@@ -1920,6 +3323,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
 
     case 0x80: /* GRP1 */
     case 0x81:
+    case 0x82:
     case 0x83:
         {
             int val;
@@ -1927,24 +3331,29 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             if ((b & 1) == 0)
                 ot = OT_BYTE;
             else
-                ot = dflag ? OT_LONG : OT_WORD;
+                ot = dflag + OT_WORD;
             
             modrm = ldub_code(s->pc++);
             mod = (modrm >> 6) & 3;
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             op = (modrm >> 3) & 7;
             
             if (mod != 3) {
+                if (b == 0x83)
+                    s->rip_offset = 1;
+                else
+                    s->rip_offset = insn_const_size(ot);
                 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                 opreg = OR_TMP0;
             } else {
-                opreg = rm + OR_EAX;
+                opreg = rm;
             }
 
             switch(b) {
             default:
             case 0x80:
             case 0x81:
+            case 0x82:
                 val = insn_get(s, ot);
                 break;
             case 0x83:
@@ -1971,13 +3380,15 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
 
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
+        rm = (modrm & 7) | REX_B(s);
         op = (modrm >> 3) & 7;
         if (mod != 3) {
+            if (op == 0)
+                s->rip_offset = insn_const_size(ot);
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_op_ld_T0_A0[ot + s->mem_index]();
         } else {
@@ -2013,58 +3424,92 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             switch(ot) {
             case OT_BYTE:
                 gen_op_mulb_AL_T0();
+                s->cc_op = CC_OP_MULB;
                 break;
             case OT_WORD:
                 gen_op_mulw_AX_T0();
+                s->cc_op = CC_OP_MULW;
                 break;
             default:
             case OT_LONG:
                 gen_op_mull_EAX_T0();
+                s->cc_op = CC_OP_MULL;
                 break;
+#ifdef TARGET_X86_64
+            case OT_QUAD:
+                gen_op_mulq_EAX_T0();
+                s->cc_op = CC_OP_MULQ;
+                break;
+#endif
             }
-            s->cc_op = CC_OP_MUL;
             break;
         case 5: /* imul */
             switch(ot) {
             case OT_BYTE:
                 gen_op_imulb_AL_T0();
+                s->cc_op = CC_OP_MULB;
                 break;
             case OT_WORD:
                 gen_op_imulw_AX_T0();
+                s->cc_op = CC_OP_MULW;
                 break;
             default:
             case OT_LONG:
                 gen_op_imull_EAX_T0();
+                s->cc_op = CC_OP_MULL;
                 break;
+#ifdef TARGET_X86_64
+            case OT_QUAD:
+                gen_op_imulq_EAX_T0();
+                s->cc_op = CC_OP_MULQ;
+                break;
+#endif
             }
-            s->cc_op = CC_OP_MUL;
             break;
         case 6: /* div */
             switch(ot) {
             case OT_BYTE:
-                gen_op_divb_AL_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_divb_AL_T0();
                 break;
             case OT_WORD:
-                gen_op_divw_AX_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_divw_AX_T0();
                 break;
             default:
             case OT_LONG:
-                gen_op_divl_EAX_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_divl_EAX_T0();
                 break;
+#ifdef TARGET_X86_64
+            case OT_QUAD:
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_divq_EAX_T0();
+                break;
+#endif
             }
             break;
         case 7: /* idiv */
             switch(ot) {
             case OT_BYTE:
-                gen_op_idivb_AL_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_idivb_AL_T0();
                 break;
             case OT_WORD:
-                gen_op_idivw_AX_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_idivw_AX_T0();
                 break;
             default:
             case OT_LONG:
-                gen_op_idivl_EAX_T0(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_idivl_EAX_T0();
                 break;
+#ifdef TARGET_X86_64
+            case OT_QUAD:
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_idivq_EAX_T0();
+                break;
+#endif
             }
             break;
         default:
@@ -2077,15 +3522,28 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
 
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
+        rm = (modrm & 7) | REX_B(s);
         op = (modrm >> 3) & 7;
         if (op >= 2 && b == 0xfe) {
             goto illegal_op;
         }
+        if (CODE64(s)) {
+            if (op == 2 || op == 4) {
+                /* operand size for jumps is 64 bit */
+                ot = OT_QUAD;
+            } else if (op == 3 || op == 5) {
+                /* for far calls and far jumps (lcall/ljmp), the operand
+                   is 16 or 32 bit, even in long mode */
+                ot = dflag ? OT_LONG : OT_WORD;
+            } else if (op == 6) {
+                /* default push size is 64 bit */
+                ot = dflag ? OT_QUAD : OT_WORD;
+            }
+        }
         if (mod != 3) {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             if (op >= 2 && op != 3 && op != 5)
@@ -2110,25 +3568,25 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             gen_inc(s, ot, opreg, -1);
             break;
         case 2: /* call Ev */
-            /* XXX: optimize if memory (no and is necessary) */
+            /* XXX: optimize if memory (no 'and' is necessary) */
             if (s->dflag == 0)
                 gen_op_andl_T0_ffff();
-            gen_op_jmp_T0();
             next_eip = s->pc - s->cs_base;
-            gen_op_movl_T0_im(next_eip);
-            gen_push_T0(s);
+            gen_movtl_T1_im(next_eip);
+            gen_push_T1(s);
+            gen_op_jmp_T0();
             gen_eob(s);
             break;
         case 3: /* lcall Ev */
             gen_op_ld_T1_A0[ot + s->mem_index]();
-            gen_op_addl_A0_im(1 << (ot - OT_WORD + 1));
+            gen_add_A0_im(s, 1 << (ot - OT_WORD + 1));
             gen_op_ldu_T0_A0[OT_WORD + s->mem_index]();
         do_lcall:
             if (s->pe && !s->vm86) {
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_op_jmp_im(pc_start - s->cs_base);
-                gen_op_lcall_protected_T0_T1(dflag, s->pc - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_lcall_protected_T0_T1(dflag, s->pc - pc_start);
             } else {
                 gen_op_lcall_real_T0_T1(dflag, s->pc - s->cs_base);
             }
@@ -2142,14 +3600,14 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             break;
         case 5: /* ljmp Ev */
             gen_op_ld_T1_A0[ot + s->mem_index]();
-            gen_op_addl_A0_im(1 << (ot - OT_WORD + 1));
+            gen_add_A0_im(s, 1 << (ot - OT_WORD + 1));
             gen_op_ldu_T0_A0[OT_WORD + s->mem_index]();
         do_ljmp:
             if (s->pe && !s->vm86) {
                 if (s->cc_op != CC_OP_DYNAMIC)
                     gen_op_set_cc_op(s->cc_op);
-                gen_op_jmp_im(pc_start - s->cs_base);
-                gen_op_ljmp_protected_T0_T1();
+                gen_jmp_im(pc_start - s->cs_base);
+                gen_op_ljmp_protected_T0_T1(s->pc - pc_start);
             } else {
                 gen_op_movl_seg_T0_vm(offsetof(CPUX86State,segs[R_CS]));
                 gen_op_movl_T0_T1();
@@ -2170,15 +3628,15 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
 
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
-        reg = (modrm >> 3) & 7;
+        rm = (modrm & 7) | REX_B(s);
+        reg = ((modrm >> 3) & 7) | rex_r;
         
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
-        gen_op_mov_TN_reg[ot][1][reg + OR_EAX]();
+        gen_op_mov_TN_reg[ot][1][reg]();
         gen_op_testl_T0_T1_cc();
         s->cc_op = CC_OP_LOGICB + ot;
         break;
@@ -2188,7 +3646,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         val = insn_get(s, ot);
 
         gen_op_mov_TN_reg[ot][0][OR_EAX]();
@@ -2198,13 +3656,23 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         break;
         
     case 0x98: /* CWDE/CBW */
-        if (dflag)
+#ifdef TARGET_X86_64
+        if (dflag == 2) {
+            gen_op_movslq_RAX_EAX();
+        } else
+#endif
+        if (dflag == 1)
             gen_op_movswl_EAX_AX();
         else
             gen_op_movsbw_AX_AL();
         break;
     case 0x99: /* CDQ/CWD */
-        if (dflag)
+#ifdef TARGET_X86_64
+        if (dflag == 2) {
+            gen_op_movsqo_RDX_RAX();
+        } else
+#endif
+        if (dflag == 1)
             gen_op_movslq_EDX_EAX();
         else
             gen_op_movswl_DX_AX();
@@ -2212,44 +3680,53 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     case 0x1af: /* imul Gv, Ev */
     case 0x69: /* imul Gv, Ev, I */
     case 0x6b:
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = ((modrm >> 3) & 7) + OR_EAX;
+        reg = ((modrm >> 3) & 7) | rex_r;
+        if (b == 0x69)
+            s->rip_offset = insn_const_size(ot);
+        else if (b == 0x6b)
+            s->rip_offset = 1;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         if (b == 0x69) {
             val = insn_get(s, ot);
             gen_op_movl_T1_im(val);
         } else if (b == 0x6b) {
-            val = insn_get(s, OT_BYTE);
+            val = (int8_t)insn_get(s, OT_BYTE);
             gen_op_movl_T1_im(val);
         } else {
             gen_op_mov_TN_reg[ot][1][reg]();
         }
 
+#ifdef TARGET_X86_64
+        if (ot == OT_QUAD) {
+            gen_op_imulq_T0_T1();
+        } else
+#endif
         if (ot == OT_LONG) {
             gen_op_imull_T0_T1();
         } else {
             gen_op_imulw_T0_T1();
         }
         gen_op_mov_reg_T0[ot][reg]();
-        s->cc_op = CC_OP_MUL;
+        s->cc_op = CC_OP_MULB + ot;
         break;
     case 0x1c0:
     case 0x1c1: /* xadd Ev, Gv */
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3) {
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             gen_op_mov_TN_reg[ot][0][reg]();
             gen_op_mov_TN_reg[ot][1][rm]();
             gen_op_addl_T0_T1();
-            gen_op_mov_reg_T0[ot][rm]();
             gen_op_mov_reg_T1[ot][reg]();
+            gen_op_mov_reg_T0[ot][rm]();
         } else {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_op_mov_TN_reg[ot][0][reg]();
@@ -2266,20 +3743,20 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         gen_op_mov_TN_reg[ot][1][reg]();
         if (mod == 3) {
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             gen_op_mov_TN_reg[ot][0][rm]();
             gen_op_cmpxchg_T0_T1_EAX_cc[ot]();
             gen_op_mov_reg_T0[ot][rm]();
         } else {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_op_ld_T0_A0[ot + s->mem_index]();
-            gen_op_cmpxchg_mem_T0_T1_EAX_cc[ot]();
+            gen_op_cmpxchg_mem_T0_T1_EAX_cc[ot + s->mem_index]();
         }
         s->cc_op = CC_OP_SUBB + ot;
         break;
@@ -2298,25 +3775,37 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         /**************************/
         /* push/pop */
     case 0x50 ... 0x57: /* push */
-        gen_op_mov_TN_reg[OT_LONG][0][b & 7]();
+        gen_op_mov_TN_reg[OT_LONG][0][(b & 7) | REX_B(s)]();
         gen_push_T0(s);
         break;
     case 0x58 ... 0x5f: /* pop */
-        ot = dflag ? OT_LONG : OT_WORD;
+        if (CODE64(s)) {
+            ot = dflag ? OT_QUAD : OT_WORD;
+        } else {
+            ot = dflag + OT_WORD;
+        }
         gen_pop_T0(s);
         /* NOTE: order is important for pop %sp */
         gen_pop_update(s);
-        gen_op_mov_reg_T0[ot][b & 7]();
+        gen_op_mov_reg_T0[ot][(b & 7) | REX_B(s)]();
         break;
     case 0x60: /* pusha */
+        if (CODE64(s))
+            goto illegal_op;
         gen_pusha(s);
         break;
     case 0x61: /* popa */
+        if (CODE64(s))
+            goto illegal_op;
         gen_popa(s);
         break;
     case 0x68: /* push Iv */
     case 0x6a:
-        ot = dflag ? OT_LONG : OT_WORD;
+        if (CODE64(s)) {
+            ot = dflag ? OT_QUAD : OT_WORD;
+        } else {
+            ot = dflag + OT_WORD;
+        }
         if (b == 0x68)
             val = insn_get(s, ot);
         else
@@ -2325,18 +3814,22 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         gen_push_T0(s);
         break;
     case 0x8f: /* pop Ev */
-        ot = dflag ? OT_LONG : OT_WORD;
+        if (CODE64(s)) {
+            ot = dflag ? OT_QUAD : OT_WORD;
+        } else {
+            ot = dflag + OT_WORD;
+        }
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
         gen_pop_T0(s);
         if (mod == 3) {
             /* NOTE: order is important for pop %sp */
             gen_pop_update(s);
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             gen_op_mov_reg_T0[ot][rm]();
         } else {
             /* NOTE: order is important too for MMU exceptions */
-            s->popl_esp_hack = 2 << dflag;
+            s->popl_esp_hack = 1 << ot;
             gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1);
             s->popl_esp_hack = 0;
             gen_pop_update(s);
@@ -2353,7 +3846,10 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         break;
     case 0xc9: /* leave */
         /* XXX: exception not precise (ESP is updated before potential exception) */
-        if (s->ss32) {
+        if (CODE64(s)) {
+            gen_op_mov_TN_reg[OT_QUAD][0][R_EBP]();
+            gen_op_mov_reg_T0[OT_QUAD][R_ESP]();
+        } else if (s->ss32) {
             gen_op_mov_TN_reg[OT_LONG][0][R_EBP]();
             gen_op_mov_reg_T0[OT_LONG][R_ESP]();
         } else {
@@ -2361,7 +3857,11 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             gen_op_mov_reg_T0[OT_WORD][R_ESP]();
         }
         gen_pop_T0(s);
-        ot = dflag ? OT_LONG : OT_WORD;
+        if (CODE64(s)) {
+            ot = dflag ? OT_QUAD : OT_WORD;
+        } else {
+            ot = dflag + OT_WORD;
+        }
         gen_op_mov_reg_T0[ot][R_EBP]();
         gen_pop_update(s);
         break;
@@ -2369,6 +3869,8 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     case 0x0e: /* push cs */
     case 0x16: /* push ss */
     case 0x1e: /* push ds */
+        if (CODE64(s))
+            goto illegal_op;
         gen_op_movl_T0_seg(b >> 3);
         gen_push_T0(s);
         break;
@@ -2380,17 +3882,22 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     case 0x07: /* pop es */
     case 0x17: /* pop ss */
     case 0x1f: /* pop ds */
+        if (CODE64(s))
+            goto illegal_op;
         reg = b >> 3;
         gen_pop_T0(s);
         gen_movl_seg_T0(s, reg, pc_start - s->cs_base);
         gen_pop_update(s);
         if (reg == R_SS) {
-            /* if reg == SS, inhibit interrupts/trace */
-            gen_op_set_inhibit_irq();
+            /* if reg == SS, inhibit interrupts/trace. */
+            /* If several instructions disable interrupts, only the
+               _first_ does it */
+            if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK))
+                gen_op_set_inhibit_irq();
             s->tf = 0;
         }
         if (s->is_jmp) {
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
         break;
@@ -2400,7 +3907,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         gen_movl_seg_T0(s, (b >> 3) & 7, pc_start - s->cs_base);
         gen_pop_update(s);
         if (s->is_jmp) {
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
         break;
@@ -2412,38 +3919,40 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         
         /* generate a generic store */
-        gen_ldst_modrm(s, modrm, ot, OR_EAX + reg, 1);
+        gen_ldst_modrm(s, modrm, ot, reg, 1);
         break;
     case 0xc6:
     case 0xc7: /* mov Ev, Iv */
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
-        if (mod != 3)
+        if (mod != 3) {
+            s->rip_offset = insn_const_size(ot);
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+        }
         val = insn_get(s, ot);
         gen_op_movl_T0_im(val);
         if (mod != 3)
             gen_op_st_T0_A0[ot + s->mem_index]();
         else
-            gen_op_mov_reg_T0[ot][modrm & 7]();
+            gen_op_mov_reg_T0[ot][(modrm & 7) | REX_B(s)]();
         break;
     case 0x8a:
     case 0x8b: /* mov Ev, Gv */
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = OT_WORD + dflag;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_reg_T0[ot][reg]();
@@ -2457,11 +3966,14 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         gen_movl_seg_T0(s, reg, pc_start - s->cs_base);
         if (reg == R_SS) {
             /* if reg == SS, inhibit interrupts/trace */
-            gen_op_set_inhibit_irq();
+            /* If several instructions disable interrupts, only the
+               _first_ does it */
+            if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK))
+                gen_op_set_inhibit_irq();
             s->tf = 0;
         }
         if (s->is_jmp) {
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
         break;
@@ -2472,9 +3984,10 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if (reg >= 6)
             goto illegal_op;
         gen_op_movl_T0_seg(reg);
-        ot = OT_WORD;
-        if (mod == 3 && dflag)
-            ot = OT_LONG;
+        if (mod == 3)
+            ot = OT_WORD + dflag;
+        else
+            ot = OT_WORD;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 1);
         break;
 
@@ -2489,9 +4002,9 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             /* ot is the size of source */
             ot = (b & 1) + OT_BYTE;
             modrm = ldub_code(s->pc++);
-            reg = ((modrm >> 3) & 7) + OR_EAX;
+            reg = ((modrm >> 3) & 7) | rex_r;
             mod = (modrm >> 6) & 3;
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             
             if (mod == 3) {
                 gen_op_mov_TN_reg[ot][0][rm]();
@@ -2524,9 +4037,12 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         break;
 
     case 0x8d: /* lea */
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        mod = (modrm >> 6) & 3;
+        if (mod == 3)
+            goto illegal_op;
+        reg = ((modrm >> 3) & 7) | rex_r;
         /* we must ensure that no segment is added */
         s->override = -1;
         val = s->addseg;
@@ -2540,76 +4056,87 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     case 0xa1:
     case 0xa2: /* mov Ov, EAX */
     case 0xa3:
-        if ((b & 1) == 0)
-            ot = OT_BYTE;
-        else
-            ot = dflag ? OT_LONG : OT_WORD;
-        if (s->aflag)
-            offset_addr = insn_get(s, OT_LONG);
-        else
-            offset_addr = insn_get(s, OT_WORD);
-        gen_op_movl_A0_im(offset_addr);
-        /* handle override */
         {
-            int override, must_add_seg;
-            must_add_seg = s->addseg;
-            if (s->override >= 0) {
-                override = s->override;
-                must_add_seg = 1;
-            } else {
-                override = R_DS;
+            target_ulong offset_addr;
+
+            if ((b & 1) == 0)
+                ot = OT_BYTE;
+            else
+                ot = dflag + OT_WORD;
+#ifdef TARGET_X86_64
+            if (s->aflag == 2) {
+                offset_addr = ldq_code(s->pc);
+                s->pc += 8;
+                if (offset_addr == (int32_t)offset_addr)
+                    gen_op_movq_A0_im(offset_addr);
+                else
+                    gen_op_movq_A0_im64(offset_addr >> 32, offset_addr);
+            } else 
+#endif
+            {
+                if (s->aflag) {
+                    offset_addr = insn_get(s, OT_LONG);
+                } else {
+                    offset_addr = insn_get(s, OT_WORD);
+                }
+                gen_op_movl_A0_im(offset_addr);
             }
-            if (must_add_seg) {
-                gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
+            gen_add_A0_ds_seg(s);
+            if ((b & 2) == 0) {
+                gen_op_ld_T0_A0[ot + s->mem_index]();
+                gen_op_mov_reg_T0[ot][R_EAX]();
+            } else {
+                gen_op_mov_TN_reg[ot][0][R_EAX]();
+                gen_op_st_T0_A0[ot + s->mem_index]();
             }
         }
-        if ((b & 2) == 0) {
-            gen_op_ld_T0_A0[ot + s->mem_index]();
-            gen_op_mov_reg_T0[ot][R_EAX]();
-        } else {
-            gen_op_mov_TN_reg[ot][0][R_EAX]();
-            gen_op_st_T0_A0[ot + s->mem_index]();
-        }
         break;
     case 0xd7: /* xlat */
-        gen_op_movl_A0_reg[R_EBX]();
-        gen_op_addl_A0_AL();
-        if (s->aflag == 0)
-            gen_op_andl_A0_ffff();
-        /* handle override */
+#ifdef TARGET_X86_64
+        if (s->aflag == 2) {
+            gen_op_movq_A0_reg[R_EBX]();
+            gen_op_addq_A0_AL();
+        } else 
+#endif
         {
-            int override, must_add_seg;
-            must_add_seg = s->addseg;
-            override = R_DS;
-            if (s->override >= 0) {
-                override = s->override;
-                must_add_seg = 1;
-            } else {
-                override = R_DS;
-            }
-            if (must_add_seg) {
-                gen_op_addl_A0_seg(offsetof(CPUX86State,segs[override].base));
-            }
+            gen_op_movl_A0_reg[R_EBX]();
+            gen_op_addl_A0_AL();
+            if (s->aflag == 0)
+                gen_op_andl_A0_ffff();
         }
+        gen_add_A0_ds_seg(s);
         gen_op_ldu_T0_A0[OT_BYTE + s->mem_index]();
         gen_op_mov_reg_T0[OT_BYTE][R_EAX]();
         break;
     case 0xb0 ... 0xb7: /* mov R, Ib */
         val = insn_get(s, OT_BYTE);
         gen_op_movl_T0_im(val);
-        gen_op_mov_reg_T0[OT_BYTE][b & 7]();
+        gen_op_mov_reg_T0[OT_BYTE][(b & 7) | REX_B(s)]();
         break;
     case 0xb8 ... 0xbf: /* mov R, Iv */
-        ot = dflag ? OT_LONG : OT_WORD;
-        val = insn_get(s, ot);
-        reg = OR_EAX + (b & 7);
-        gen_op_movl_T0_im(val);
-        gen_op_mov_reg_T0[ot][reg]();
+#ifdef TARGET_X86_64
+        if (dflag == 2) {
+            uint64_t tmp;
+            /* 64 bit case */
+            tmp = ldq_code(s->pc);
+            s->pc += 8;
+            reg = (b & 7) | REX_B(s);
+            gen_movtl_T0_im(tmp);
+            gen_op_mov_reg_T0[OT_QUAD][reg]();
+        } else 
+#endif
+        {
+            ot = dflag ? OT_LONG : OT_WORD;
+            val = insn_get(s, ot);
+            reg = (b & 7) | REX_B(s);
+            gen_op_movl_T0_im(val);
+            gen_op_mov_reg_T0[ot][reg]();
+        }
         break;
 
     case 0x91 ... 0x97: /* xchg R, EAX */
-        ot = dflag ? OT_LONG : OT_WORD;
-        reg = b & 7;
+        ot = dflag + OT_WORD;
+        reg = (b & 7) | REX_B(s);
         rm = R_EAX;
         goto do_xchg_reg;
     case 0x86:
@@ -2617,12 +4144,12 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3) {
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
         do_xchg_reg:
             gen_op_mov_TN_reg[ot][0][reg]();
             gen_op_mov_TN_reg[ot][1][rm]();
@@ -2642,9 +4169,13 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         }
         break;
     case 0xc4: /* les Gv */
+        if (CODE64(s))
+            goto illegal_op;
         op = R_ES;
         goto do_lxx;
     case 0xc5: /* lds Gv */
+        if (CODE64(s))
+            goto illegal_op;
         op = R_DS;
         goto do_lxx;
     case 0x1b2: /* lss Gv */
@@ -2658,20 +4189,20 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     do_lxx:
         ot = dflag ? OT_LONG : OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         if (mod == 3)
             goto illegal_op;
         gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
         gen_op_ld_T1_A0[ot + s->mem_index]();
-        gen_op_addl_A0_im(1 << (ot - OT_WORD + 1));
+        gen_add_A0_im(s, 1 << (ot - OT_WORD + 1));
         /* load the segment first to handle exceptions properly */
         gen_op_ldu_T0_A0[OT_WORD + s->mem_index]();
         gen_movl_seg_T0(s, op, pc_start - s->cs_base);
         /* then put the data */
         gen_op_mov_reg_T1[ot][reg]();
         if (s->is_jmp) {
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
         break;
@@ -2687,18 +4218,20 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             if ((b & 1) == 0)
                 ot = OT_BYTE;
             else
-                ot = dflag ? OT_LONG : OT_WORD;
+                ot = dflag + OT_WORD;
             
             modrm = ldub_code(s->pc++);
             mod = (modrm >> 6) & 3;
-            rm = modrm & 7;
             op = (modrm >> 3) & 7;
             
             if (mod != 3) {
+                if (shift == 2) {
+                    s->rip_offset = 1;
+                }
                 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                 opreg = OR_TMP0;
             } else {
-                opreg = rm + OR_EAX;
+                opreg = (modrm & 7) | REX_B(s);
             }
 
             /* simpler op */
@@ -2739,11 +4272,11 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         op = 1;
         shift = 0;
     do_shiftd:
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
-        reg = (modrm >> 3) & 7;
+        rm = (modrm & 7) | REX_B(s);
+        reg = ((modrm >> 3) & 7) | rex_r;
         
         if (mod != 3) {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
@@ -2755,12 +4288,15 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         
         if (shift) {
             val = ldub_code(s->pc++);
-            val &= 0x1f;
+            if (ot == OT_QUAD)
+                val &= 0x3f;
+            else
+                val &= 0x1f;
             if (val) {
                 if (mod == 3)
-                    gen_op_shiftd_T0_T1_im_cc[ot - OT_WORD][op](val);
+                    gen_op_shiftd_T0_T1_im_cc[ot][op](val);
                 else
-                    gen_op_shiftd_mem_T0_T1_im_cc[ot - OT_WORD][op](val);
+                    gen_op_shiftd_mem_T0_T1_im_cc[ot + s->mem_index][op](val);
                 if (op == 0 && ot != OT_WORD)
                     s->cc_op = CC_OP_SHLB + ot;
                 else
@@ -2770,9 +4306,9 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
             if (mod == 3)
-                gen_op_shiftd_T0_T1_ECX_cc[ot - OT_WORD][op]();
+                gen_op_shiftd_T0_T1_ECX_cc[ot][op]();
             else
-                gen_op_shiftd_mem_T0_T1_ECX_cc[ot - OT_WORD][op]();
+                gen_op_shiftd_mem_T0_T1_ECX_cc[ot + s->mem_index][op]();
             s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
         }
         if (mod == 3) {
@@ -2783,11 +4319,16 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         /************************/
         /* floats */
     case 0xd8 ... 0xdf: 
+        if (s->flags & (HF_EM_MASK | HF_TS_MASK)) {
+            /* if CR0.EM or CR0.TS are set, generate an FPU exception */
+            /* XXX: what to do if illegal op ? */
+            gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+            break;
+        }
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
         rm = modrm & 7;
         op = ((b & 7) << 3) | ((modrm >> 3) & 7);
-        
         if (mod != 3) {
             /* memory op */
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
@@ -2826,16 +4367,9 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             case 0x08: /* flds */
             case 0x0a: /* fsts */
             case 0x0b: /* fstps */
-            case 0x18: /* fildl */
-            case 0x1a: /* fistl */
-            case 0x1b: /* fistpl */
-            case 0x28: /* fldl */
-            case 0x2a: /* fstl */
-            case 0x2b: /* fstpl */
-            case 0x38: /* filds */
-            case 0x3a: /* fists */
-            case 0x3b: /* fistps */
-                
+            case 0x18 ... 0x1b: /* fildl, fisttpl, fistl, fistpl */
+            case 0x28 ... 0x2b: /* fldl, fisttpll, fstl, fstpl */
+            case 0x38 ... 0x3b: /* filds, fisttps, fists, fistps */
                 switch(op & 7) {
                 case 0:
                     switch(op >> 4) {
@@ -2854,6 +4388,20 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                         break;
                     }
                     break;
+                case 1:
+                    switch(op >> 4) {
+                    case 1:
+                        gen_op_fisttl_ST0_A0();
+                        break;
+                    case 2:
+                        gen_op_fisttll_ST0_A0();
+                        break;
+                    case 3:
+                    default:
+                        gen_op_fistt_ST0_A0();
+                    }
+                    gen_op_fpop();
+                    break;
                 default:
                     switch(op >> 4) {
                     case 0:
@@ -2930,11 +4478,18 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 gen_op_fmov_ST0_STN((opreg + 1) & 7);
                 break;
             case 0x09: /* fxchg sti */
+            case 0x29: /* fxchg4 sti, undocumented op */
+            case 0x39: /* fxchg7 sti, undocumented op */
                 gen_op_fxchg_ST0_STN(opreg);
                 break;
             case 0x0a: /* grp d9/2 */
                 switch(rm) {
                 case 0: /* fnop */
+                    /* check exceptions (FreeBSD FPU probe) */
+                    if (s->cc_op != CC_OP_DYNAMIC)
+                        gen_op_set_cc_op(s->cc_op);
+                    gen_jmp_im(pc_start - s->cs_base);
+                    gen_op_fwait();
                     break;
                 default:
                     goto illegal_op;
@@ -3071,10 +4626,13 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 }
                 break;
             case 0x02: /* fcom */
+            case 0x22: /* fcom2, undocumented op */
                 gen_op_fmov_FT0_STN(opreg);
                 gen_op_fcom_ST0_FT0();
                 break;
             case 0x03: /* fcomp */
+            case 0x23: /* fcomp3, undocumented op */
+            case 0x32: /* fcomp5, undocumented op */
                 gen_op_fmov_FT0_STN(opreg);
                 gen_op_fcom_ST0_FT0();
                 gen_op_fpop();
@@ -3123,10 +4681,16 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 gen_op_fcomi_ST0_FT0();
                 s->cc_op = CC_OP_EFLAGS;
                 break;
+            case 0x28: /* ffree sti */
+                gen_op_ffree_STN(opreg);
+                break; 
             case 0x2a: /* fst sti */
                 gen_op_fmov_STN_ST0(opreg);
                 break;
             case 0x2b: /* fstp sti */
+            case 0x0b: /* fstp1 sti, undocumented op */
+            case 0x3a: /* fstp8 sti, undocumented op */
+            case 0x3b: /* fstp9 sti, undocumented op */
                 gen_op_fmov_STN_ST0(opreg);
                 gen_op_fpop();
                 break;
@@ -3151,6 +4715,10 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                     goto illegal_op;
                 }
                 break;
+            case 0x38: /* ffreep sti, undocumented op */
+                gen_op_ffree_STN(opreg);
+                gen_op_fpop();
+                break;
             case 0x3c: /* df/4 */
                 switch(rm) {
                 case 0:
@@ -3176,10 +4744,28 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 gen_op_fpop();
                 s->cc_op = CC_OP_EFLAGS;
                 break;
+            case 0x10 ... 0x13: /* fcmovxx */
+            case 0x18 ... 0x1b:
+                {
+                    int op1;
+                    const static uint8_t fcmov_cc[8] = {
+                        (JCC_B << 1),
+                        (JCC_Z << 1),
+                        (JCC_BE << 1),
+                        (JCC_P << 1),
+                    };
+                    op1 = fcmov_cc[op & 3] | ((op >> 3) & 1);
+                    gen_setcc(s, op1);
+                    gen_op_fcmov_ST0_STN_T0(opreg);
+                }
+                break;
             default:
                 goto illegal_op;
             }
         }
+#ifdef USE_CODE_COPY
+        s->tb->cflags |= CF_TB_FP_USED;
+#endif
         break;
         /************************/
         /* string ops */
@@ -3189,7 +4775,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
 
         if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
             gen_repz_movs(s, ot, pc_start - s->cs_base, s->pc - s->cs_base);
@@ -3203,7 +4789,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
 
         if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
             gen_repz_stos(s, ot, pc_start - s->cs_base, s->pc - s->cs_base);
@@ -3216,7 +4802,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
             gen_repz_lods(s, ot, pc_start - s->cs_base, s->pc - s->cs_base);
         } else {
@@ -3228,7 +4814,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-                ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         if (prefixes & PREFIX_REPNZ) {
             gen_repz_scas(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 1);
         } else if (prefixes & PREFIX_REPZ) {
@@ -3244,7 +4830,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if ((b & 1) == 0)
             ot = OT_BYTE;
         else
-            ot = dflag ? OT_LONG : OT_WORD;
+            ot = dflag + OT_WORD;
         if (prefixes & PREFIX_REPNZ) {
             gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 1);
         } else if (prefixes & PREFIX_REPZ) {
@@ -3314,6 +4900,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         else
             ot = dflag ? OT_LONG : OT_WORD;
         gen_op_mov_TN_reg[OT_WORD][0][R_EDX]();
+        gen_op_andl_T0_ffff();
         gen_check_io(s, ot, 0, pc_start - s->cs_base);
         gen_op_in[ot]();
         gen_op_mov_reg_T1[ot][R_EAX]();
@@ -3325,6 +4912,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         else
             ot = dflag ? OT_LONG : OT_WORD;
         gen_op_mov_TN_reg[OT_WORD][0][R_EDX]();
+        gen_op_andl_T0_ffff();
         gen_check_io(s, ot, 0, pc_start - s->cs_base);
         gen_op_mov_TN_reg[ot][1][R_EAX]();
         gen_op_out[ot]();
@@ -3336,6 +4924,8 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         val = ldsw_code(s->pc);
         s->pc += 2;
         gen_pop_T0(s);
+        if (CODE64(s) && s->dflag)
+            s->dflag = 2;
         gen_stack_update(s, val + (2 << s->dflag));
         if (s->dflag == 0)
             gen_op_andl_T0_ffff();
@@ -3357,7 +4947,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         if (s->pe && !s->vm86) {
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
-            gen_op_jmp_im(pc_start - s->cs_base);
+            gen_jmp_im(pc_start - s->cs_base);
             gen_op_lret_protected(s->dflag, val);
         } else {
             gen_stack_A0(s);
@@ -3395,80 +4985,87 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         } else {
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
-            gen_op_jmp_im(pc_start - s->cs_base);
-            gen_op_iret_protected(s->dflag);
+            gen_jmp_im(pc_start - s->cs_base);
+            gen_op_iret_protected(s->dflag, s->pc - s->cs_base);
             s->cc_op = CC_OP_EFLAGS;
         }
         gen_eob(s);
         break;
     case 0xe8: /* call im */
         {
-            unsigned int next_eip;
-            ot = dflag ? OT_LONG : OT_WORD;
-            val = insn_get(s, ot);
+            if (dflag)
+                tval = (int32_t)insn_get(s, OT_LONG);
+            else
+                tval = (int16_t)insn_get(s, OT_WORD);
             next_eip = s->pc - s->cs_base;
-            val += next_eip;
+            tval += next_eip;
             if (s->dflag == 0)
-                val &= 0xffff;
-            gen_op_movl_T0_im(next_eip);
+                tval &= 0xffff;
+            gen_movtl_T0_im(next_eip);
             gen_push_T0(s);
-            gen_jmp(s, val);
+            gen_jmp(s, tval);
         }
         break;
     case 0x9a: /* lcall im */
         {
             unsigned int selector, offset;
-
+            
+            if (CODE64(s))
+                goto illegal_op;
             ot = dflag ? OT_LONG : OT_WORD;
             offset = insn_get(s, ot);
             selector = insn_get(s, OT_WORD);
             
             gen_op_movl_T0_im(selector);
-            gen_op_movl_T1_im(offset);
+            gen_op_movl_T1_imu(offset);
         }
         goto do_lcall;
-    case 0xe9: /* jmp */
-        ot = dflag ? OT_LONG : OT_WORD;
-        val = insn_get(s, ot);
-        val += s->pc - s->cs_base;
+    case 0xe9: /* jmp im */
+        if (dflag)
+            tval = (int32_t)insn_get(s, OT_LONG);
+        else
+            tval = (int16_t)insn_get(s, OT_WORD);
+        tval += s->pc - s->cs_base;
         if (s->dflag == 0)
-            val = val & 0xffff;
-        gen_jmp(s, val);
+            tval &= 0xffff;
+        gen_jmp(s, tval);
         break;
     case 0xea: /* ljmp im */
         {
             unsigned int selector, offset;
 
+            if (CODE64(s))
+                goto illegal_op;
             ot = dflag ? OT_LONG : OT_WORD;
             offset = insn_get(s, ot);
             selector = insn_get(s, OT_WORD);
             
             gen_op_movl_T0_im(selector);
-            gen_op_movl_T1_im(offset);
+            gen_op_movl_T1_imu(offset);
         }
         goto do_ljmp;
     case 0xeb: /* jmp Jb */
-        val = (int8_t)insn_get(s, OT_BYTE);
-        val += s->pc - s->cs_base;
+        tval = (int8_t)insn_get(s, OT_BYTE);
+        tval += s->pc - s->cs_base;
         if (s->dflag == 0)
-            val = val & 0xffff;
-        gen_jmp(s, val);
+            tval &= 0xffff;
+        gen_jmp(s, tval);
         break;
     case 0x70 ... 0x7f: /* jcc Jb */
-        val = (int8_t)insn_get(s, OT_BYTE);
+        tval = (int8_t)insn_get(s, OT_BYTE);
         goto do_jcc;
     case 0x180 ... 0x18f: /* jcc Jv */
         if (dflag) {
-            val = insn_get(s, OT_LONG);
+            tval = (int32_t)insn_get(s, OT_LONG);
         } else {
-            val = (int16_t)insn_get(s, OT_WORD); 
+            tval = (int16_t)insn_get(s, OT_WORD); 
         }
     do_jcc:
         next_eip = s->pc - s->cs_base;
-        val += next_eip;
+        tval += next_eip;
         if (s->dflag == 0)
-            val &= 0xffff;
-        gen_jcc(s, b, val, next_eip);
+            tval &= 0xffff;
+        gen_jcc(s, b, tval, next_eip);
         break;
 
     case 0x190 ... 0x19f: /* setcc Gv */
@@ -3477,16 +5074,16 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         gen_ldst_modrm(s, modrm, OT_BYTE, OR_TMP0, 1);
         break;
     case 0x140 ... 0x14f: /* cmov Gv, Ev */
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
         gen_setcc(s, b);
         if (mod != 3) {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_op_ld_T1_A0[ot + s->mem_index]();
         } else {
-            rm = modrm & 7;
+            rm = (modrm & 7) | REX_B(s);
             gen_op_mov_TN_reg[ot][1][rm]();
         }
         gen_op_cmov_reg_T1_T0[ot - OT_WORD][reg]();
@@ -3516,20 +5113,30 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                     gen_op_movw_eflags_T0_cpl0();
                 }
             } else {
-                if (s->dflag) {
-                    gen_op_movl_eflags_T0();
+                if (s->cpl <= s->iopl) {
+                    if (s->dflag) {
+                        gen_op_movl_eflags_T0_io();
+                    } else {
+                        gen_op_movw_eflags_T0_io();
+                    }
                 } else {
-                    gen_op_movw_eflags_T0();
+                    if (s->dflag) {
+                        gen_op_movl_eflags_T0();
+                    } else {
+                        gen_op_movw_eflags_T0();
+                    }
                 }
             }
             gen_pop_update(s);
             s->cc_op = CC_OP_EFLAGS;
             /* abort translation because TF flag may change */
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
         break;
     case 0x9e: /* sahf */
+        if (CODE64(s))
+            goto illegal_op;
         gen_op_mov_TN_reg[OT_BYTE][0][R_AH]();
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
@@ -3537,6 +5144,8 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x9f: /* lahf */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_movl_T0_eflags();
@@ -3570,12 +5179,13 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         /************************/
         /* bit operations */
     case 0x1ba: /* bt/bts/btr/btc Gv, im */
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
         op = (modrm >> 3) & 7;
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
+        rm = (modrm & 7) | REX_B(s);
         if (mod != 3) {
+            s->rip_offset = 1;
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             gen_op_ld_T0_A0[ot + s->mem_index]();
         } else {
@@ -3609,19 +5219,16 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
     case 0x1bb: /* btc */
         op = 3;
     do_btx:
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
+        rm = (modrm & 7) | REX_B(s);
         gen_op_mov_TN_reg[OT_LONG][1][reg]();
         if (mod != 3) {
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             /* specific case: we need to add a displacement */
-            if (ot == OT_WORD)
-                gen_op_add_bitw_A0_T1();
-            else
-                gen_op_add_bitl_A0_T1();
+            gen_op_add_bit_A0_T1[ot - OT_WORD]();
             gen_op_ld_T0_A0[ot + s->mem_index]();
         } else {
             gen_op_mov_TN_reg[ot][0][rm]();
@@ -3638,48 +5245,61 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         break;
     case 0x1bc: /* bsf */
     case 0x1bd: /* bsr */
-        ot = dflag ? OT_LONG : OT_WORD;
+        ot = dflag + OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
+        /* NOTE: in order to handle the 0 case, we must load the
+           result. It could be optimized with a generated jump */
+        gen_op_mov_TN_reg[ot][1][reg]();
         gen_op_bsx_T0_cc[ot - OT_WORD][b & 1]();
-        /* NOTE: we always write back the result. Intel doc says it is
-           undefined if T0 == 0 */
-        gen_op_mov_reg_T0[ot][reg]();
+        gen_op_mov_reg_T1[ot][reg]();
         s->cc_op = CC_OP_LOGICB + ot;
         break;
         /************************/
         /* bcd */
     case 0x27: /* daa */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_daa();
         s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x2f: /* das */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_das();
         s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x37: /* aaa */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_aaa();
         s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x3f: /* aas */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_aas();
         s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xd4: /* aam */
+        if (CODE64(s))
+            goto illegal_op;
         val = ldub_code(s->pc++);
         gen_op_aam(val);
         s->cc_op = CC_OP_LOGICB;
         break;
     case 0xd5: /* aad */
+        if (CODE64(s))
+            goto illegal_op;
         val = ldub_code(s->pc++);
         gen_op_aad(val);
         s->cc_op = CC_OP_LOGICB;
@@ -3687,8 +5307,21 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         /************************/
         /* misc */
     case 0x90: /* nop */
+        /* XXX: xchg + rex handling */
+        /* XXX: correct lock test for all insn */
+        if (prefixes & PREFIX_LOCK)
+            goto illegal_op;
         break;
     case 0x9b: /* fwait */
+        if ((s->flags & (HF_MP_MASK | HF_TS_MASK)) == 
+            (HF_MP_MASK | HF_TS_MASK)) {
+            gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+        } else {
+            if (s->cc_op != CC_OP_DYNAMIC)
+                gen_op_set_cc_op(s->cc_op);
+            gen_jmp_im(pc_start - s->cs_base);
+            gen_op_fwait();
+        }
         break;
     case 0xcc: /* int3 */
         gen_interrupt(s, EXCP03_INT3, pc_start - s->cs_base, s->pc - s->cs_base);
@@ -3702,12 +5335,21 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         }
         break;
     case 0xce: /* into */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
-        gen_op_into(s->pc - s->cs_base);
+        gen_jmp_im(pc_start - s->cs_base);
+        gen_op_into(s->pc - pc_start);
         break;
     case 0xf1: /* icebp (undocumented, exits to external debugger) */
+#if 1
         gen_debug(s, pc_start - s->cs_base);
+#else
+        /* start debug */
+        tb_flush(cpu_single_env);
+        cpu_set_log(CPU_LOG_INT | CPU_LOG_TB_IN_ASM);
+#endif
         break;
     case 0xfa: /* cli */
         if (!s->vm86) {
@@ -3730,9 +5372,12 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             gen_sti:
                 gen_op_sti();
                 /* interruptions are enabled only the first insn after sti */
-                gen_op_set_inhibit_irq();
+                /* If several instructions disable interrupts, only the
+                   _first_ does it */
+                if (!(s->tb->flags & HF_INHIBIT_IRQ_MASK))
+                    gen_op_set_inhibit_irq();
                 /* give a chance to handle pending irqs */
-                gen_op_jmp_im(s->pc - s->cs_base);
+                gen_jmp_im(s->pc - s->cs_base);
                 gen_eob(s);
             } else {
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
@@ -3746,26 +5391,40 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         }
         break;
     case 0x62: /* bound */
+        if (CODE64(s))
+            goto illegal_op;
         ot = dflag ? OT_LONG : OT_WORD;
         modrm = ldub_code(s->pc++);
         reg = (modrm >> 3) & 7;
         mod = (modrm >> 6) & 3;
         if (mod == 3)
             goto illegal_op;
-        gen_op_mov_reg_T0[ot][reg]();
+        gen_op_mov_TN_reg[ot][0][reg]();
         gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+        gen_jmp_im(pc_start - s->cs_base);
         if (ot == OT_WORD)
-            gen_op_boundw(pc_start - s->cs_base);
+            gen_op_boundw();
         else
-            gen_op_boundl(pc_start - s->cs_base);
+            gen_op_boundl();
         break;
     case 0x1c8 ... 0x1cf: /* bswap reg */
-        reg = b & 7;
-        gen_op_mov_TN_reg[OT_LONG][0][reg]();
-        gen_op_bswapl_T0();
-        gen_op_mov_reg_T0[OT_LONG][reg]();
+        reg = (b & 7) | REX_B(s);
+#ifdef TARGET_X86_64
+        if (dflag == 2) {
+            gen_op_mov_TN_reg[OT_QUAD][0][reg]();
+            gen_op_bswapq_T0();
+            gen_op_mov_reg_T0[OT_QUAD][reg]();
+        } else 
+#endif
+        {
+            gen_op_mov_TN_reg[OT_LONG][0][reg]();
+            gen_op_bswapl_T0();
+            gen_op_mov_reg_T0[OT_LONG][reg]();
+        }
         break;
     case 0xd6: /* salc */
+        if (CODE64(s))
+            goto illegal_op;
         if (s->cc_op != CC_OP_DYNAMIC)
             gen_op_set_cc_op(s->cc_op);
         gen_op_salc();
@@ -3777,13 +5436,34 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         /* FALL THRU */
     case 0xe2: /* loop */
     case 0xe3: /* jecxz */
-        val = (int8_t)insn_get(s, OT_BYTE);
-        next_eip = s->pc - s->cs_base;
-        val += next_eip;
-        if (s->dflag == 0)
-            val &= 0xffff;
-        gen_op_loop[s->aflag][b & 3](val, next_eip);
-        gen_eob(s);
+        {
+            int l1, l2;
+
+            tval = (int8_t)insn_get(s, OT_BYTE);
+            next_eip = s->pc - s->cs_base;
+            tval += next_eip;
+            if (s->dflag == 0)
+                tval &= 0xffff;
+            
+            l1 = gen_new_label();
+            l2 = gen_new_label();
+            b &= 3;
+            if (b == 3) {
+                gen_op_jz_ecx[s->aflag](l1);
+            } else {
+                gen_op_dec_ECX[s->aflag]();
+                if (b <= 1)
+                    gen_op_mov_T0_cc();
+                gen_op_loop[s->aflag][b](l1);
+            }
+
+            gen_jmp_im(next_eip);
+            gen_op_jmp_label(l2);
+            gen_set_label(l1);
+            gen_jmp_im(tval);
+            gen_set_label(l2);
+            gen_eob(s);
+        }
         break;
     case 0x130: /* wrmsr */
     case 0x132: /* rdmsr */
@@ -3797,8 +5477,67 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         }
         break;
     case 0x131: /* rdtsc */
+        gen_jmp_im(pc_start - s->cs_base);
         gen_op_rdtsc();
         break;
+    case 0x134: /* sysenter */
+        if (CODE64(s))
+            goto illegal_op;
+        if (!s->pe) {
+            gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
+        } else {
+            if (s->cc_op != CC_OP_DYNAMIC) {
+                gen_op_set_cc_op(s->cc_op);
+                s->cc_op = CC_OP_DYNAMIC;
+            }
+            gen_jmp_im(pc_start - s->cs_base);
+            gen_op_sysenter();
+            gen_eob(s);
+        }
+        break;
+    case 0x135: /* sysexit */
+        if (CODE64(s))
+            goto illegal_op;
+        if (!s->pe) {
+            gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
+        } else {
+            if (s->cc_op != CC_OP_DYNAMIC) {
+                gen_op_set_cc_op(s->cc_op);
+                s->cc_op = CC_OP_DYNAMIC;
+            }
+            gen_jmp_im(pc_start - s->cs_base);
+            gen_op_sysexit();
+            gen_eob(s);
+        }
+        break;
+#ifdef TARGET_X86_64
+    case 0x105: /* syscall */
+        /* XXX: is it usable in real mode ? */
+        if (s->cc_op != CC_OP_DYNAMIC) {
+            gen_op_set_cc_op(s->cc_op);
+            s->cc_op = CC_OP_DYNAMIC;
+        }
+        gen_jmp_im(pc_start - s->cs_base);
+        gen_op_syscall(s->pc - pc_start);
+        gen_eob(s);
+        break;
+    case 0x107: /* sysret */
+        if (!s->pe) {
+            gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
+        } else {
+            if (s->cc_op != CC_OP_DYNAMIC) {
+                gen_op_set_cc_op(s->cc_op);
+                s->cc_op = CC_OP_DYNAMIC;
+            }
+            gen_jmp_im(pc_start - s->cs_base);
+            gen_op_sysret(s->dflag);
+            /* condition codes are modified only in long mode */
+            if (s->lma)
+                s->cc_op = CC_OP_EFLAGS;
+            gen_eob(s);
+        }
+        break;
+#endif
     case 0x1a2: /* cpuid */
         gen_op_cpuid();
         break;
@@ -3808,7 +5547,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         } else {
             if (s->cc_op != CC_OP_DYNAMIC)
                 gen_op_set_cc_op(s->cc_op);
-            gen_op_jmp_im(s->pc - s->cs_base);
+            gen_jmp_im(s->pc - s->cs_base);
             gen_op_hlt();
             s->is_jmp = 3;
         }
@@ -3834,7 +5573,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
             } else {
                 gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
-                gen_op_jmp_im(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
                 gen_op_lldt_T0();
             }
             break;
@@ -3854,7 +5593,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
             } else {
                 gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
-                gen_op_jmp_im(pc_start - s->cs_base);
+                gen_jmp_im(pc_start - s->cs_base);
                 gen_op_ltr_T0();
             }
             break;
@@ -3879,25 +5618,68 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         modrm = ldub_code(s->pc++);
         mod = (modrm >> 6) & 3;
         op = (modrm >> 3) & 7;
+        rm = modrm & 7;
         switch(op) {
         case 0: /* sgdt */
-        case 1: /* sidt */
             if (mod == 3)
                 goto illegal_op;
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
-            if (op == 0)
-                gen_op_movl_T0_env(offsetof(CPUX86State,gdt.limit));
-            else
-                gen_op_movl_T0_env(offsetof(CPUX86State,idt.limit));
+            gen_op_movl_T0_env(offsetof(CPUX86State, gdt.limit));
             gen_op_st_T0_A0[OT_WORD + s->mem_index]();
-            gen_op_addl_A0_im(2);
-            if (op == 0)
-                gen_op_movl_T0_env(offsetof(CPUX86State,gdt.base));
-            else
-                gen_op_movl_T0_env(offsetof(CPUX86State,idt.base));
+            gen_add_A0_im(s, 2);
+            gen_op_movtl_T0_env(offsetof(CPUX86State, gdt.base));
             if (!s->dflag)
                 gen_op_andl_T0_im(0xffffff);
-            gen_op_st_T0_A0[OT_LONG + s->mem_index]();
+            gen_op_st_T0_A0[CODE64(s) + OT_LONG + s->mem_index]();
+            break;
+        case 1:
+            if (mod == 3) {
+                switch (rm) {
+                case 0: /* monitor */
+                    if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) ||
+                        s->cpl != 0)
+                        goto illegal_op;
+                    gen_jmp_im(pc_start - s->cs_base);
+#ifdef TARGET_X86_64
+                    if (s->aflag == 2) {
+                        gen_op_movq_A0_reg[R_EBX]();
+                        gen_op_addq_A0_AL();
+                    } else 
+#endif
+                    {
+                        gen_op_movl_A0_reg[R_EBX]();
+                        gen_op_addl_A0_AL();
+                        if (s->aflag == 0)
+                            gen_op_andl_A0_ffff();
+                    }
+                    gen_add_A0_ds_seg(s);
+                    gen_op_monitor();
+                    break;
+                case 1: /* mwait */
+                    if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) ||
+                        s->cpl != 0)
+                        goto illegal_op;
+                    if (s->cc_op != CC_OP_DYNAMIC) {
+                        gen_op_set_cc_op(s->cc_op);
+                        s->cc_op = CC_OP_DYNAMIC;
+                    }
+                    gen_jmp_im(s->pc - s->cs_base);
+                    gen_op_mwait();
+                    gen_eob(s);
+                    break;
+                default:
+                    goto illegal_op;
+                }
+            } else { /* sidt */
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_op_movl_T0_env(offsetof(CPUX86State, idt.limit));
+                gen_op_st_T0_A0[OT_WORD + s->mem_index]();
+                gen_add_A0_im(s, 2);
+                gen_op_movtl_T0_env(offsetof(CPUX86State, idt.base));
+                if (!s->dflag)
+                    gen_op_andl_T0_im(0xffffff);
+                gen_op_st_T0_A0[CODE64(s) + OT_LONG + s->mem_index]();
+            }
             break;
         case 2: /* lgdt */
         case 3: /* lidt */
@@ -3908,15 +5690,15 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             } else {
                 gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
                 gen_op_ld_T1_A0[OT_WORD + s->mem_index]();
-                gen_op_addl_A0_im(2);
-                gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                gen_add_A0_im(s, 2);
+                gen_op_ld_T0_A0[CODE64(s) + OT_LONG + s->mem_index]();
                 if (!s->dflag)
                     gen_op_andl_T0_im(0xffffff);
                 if (op == 2) {
-                    gen_op_movl_env_T0(offsetof(CPUX86State,gdt.base));
+                    gen_op_movtl_env_T0(offsetof(CPUX86State,gdt.base));
                     gen_op_movl_env_T1(offsetof(CPUX86State,gdt.limit));
                 } else {
-                    gen_op_movl_env_T0(offsetof(CPUX86State,idt.base));
+                    gen_op_movtl_env_T0(offsetof(CPUX86State,idt.base));
                     gen_op_movl_env_T1(offsetof(CPUX86State,idt.limit));
                 }
             }
@@ -3931,7 +5713,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             } else {
                 gen_ldst_modrm(s, modrm, OT_WORD, OR_TMP0, 0);
                 gen_op_lmsw_T0();
-                gen_op_jmp_im(s->pc - s->cs_base);
+                gen_jmp_im(s->pc - s->cs_base);
                 gen_eob(s);
             }
             break;
@@ -3939,40 +5721,93 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             if (s->cpl != 0) {
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
             } else {
-                if (mod == 3)
-                    goto illegal_op;
-                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
-                gen_op_invlpg_A0();
+                if (mod == 3) {
+#ifdef TARGET_X86_64
+                    if (CODE64(s) && rm == 0) {
+                        /* swapgs */
+                        gen_op_movtl_T0_env(offsetof(CPUX86State,segs[R_GS].base));
+                        gen_op_movtl_T1_env(offsetof(CPUX86State,kernelgsbase));
+                        gen_op_movtl_env_T1(offsetof(CPUX86State,segs[R_GS].base));
+                        gen_op_movtl_env_T0(offsetof(CPUX86State,kernelgsbase));
+                    } else 
+#endif
+                    {
+                        goto illegal_op;
+                    }
+                } else {
+                    gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                    gen_op_invlpg_A0();
+                    gen_jmp_im(s->pc - s->cs_base);
+                    gen_eob(s);
+                }
             }
             break;
         default:
             goto illegal_op;
         }
         break;
-    case 0x63: /* arpl */
-        if (!s->pe || s->vm86)
-            goto illegal_op;
-        ot = dflag ? OT_LONG : OT_WORD;
-        modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
-        mod = (modrm >> 6) & 3;
-        rm = modrm & 7;
-        if (mod != 3) {
-            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
-            gen_op_ld_T0_A0[ot + s->mem_index]();
+    case 0x108: /* invd */
+    case 0x109: /* wbinvd */
+        if (s->cpl != 0) {
+            gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            gen_op_mov_TN_reg[ot][0][rm]();
+            /* nothing to do */
         }
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_op_arpl();
-        s->cc_op = CC_OP_EFLAGS;
-        if (mod != 3) {
-            gen_op_st_T0_A0[ot + s->mem_index]();
-        } else {
-            gen_op_mov_reg_T0[ot][rm]();
+        break;
+    case 0x63: /* arpl or movslS (x86_64) */
+#ifdef TARGET_X86_64
+        if (CODE64(s)) {
+            int d_ot;
+            /* d_ot is the size of destination */
+            d_ot = dflag + OT_WORD;
+
+            modrm = ldub_code(s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+            mod = (modrm >> 6) & 3;
+            rm = (modrm & 7) | REX_B(s);
+            
+            if (mod == 3) {
+                gen_op_mov_TN_reg[OT_LONG][0][rm]();
+                /* sign extend */
+                if (d_ot == OT_QUAD)
+                    gen_op_movslq_T0_T0();
+                gen_op_mov_reg_T0[d_ot][reg]();
+            } else {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                if (d_ot == OT_QUAD) {
+                    gen_op_lds_T0_A0[OT_LONG + s->mem_index]();
+                } else {
+                    gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                }
+                gen_op_mov_reg_T0[d_ot][reg]();
+            }
+        } else 
+#endif
+        {
+            if (!s->pe || s->vm86)
+                goto illegal_op;
+            ot = dflag ? OT_LONG : OT_WORD;
+            modrm = ldub_code(s->pc++);
+            reg = (modrm >> 3) & 7;
+            mod = (modrm >> 6) & 3;
+            rm = modrm & 7;
+            if (mod != 3) {
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+                gen_op_ld_T0_A0[ot + s->mem_index]();
+            } else {
+                gen_op_mov_TN_reg[ot][0][rm]();
+            }
+            if (s->cc_op != CC_OP_DYNAMIC)
+                gen_op_set_cc_op(s->cc_op);
+            gen_op_arpl();
+            s->cc_op = CC_OP_EFLAGS;
+            if (mod != 3) {
+                gen_op_st_T0_A0[ot + s->mem_index]();
+            } else {
+                gen_op_mov_reg_T0[ot][rm]();
+            }
+            gen_op_arpl_update();
         }
-        gen_op_arpl_update();
         break;
     case 0x102: /* lar */
     case 0x103: /* lsl */
@@ -3980,7 +5815,7 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             goto illegal_op;
         ot = dflag ? OT_LONG : OT_WORD;
         modrm = ldub_code(s->pc++);
-        reg = (modrm >> 3) & 7;
+        reg = ((modrm >> 3) & 7) | rex_r;
         gen_ldst_modrm(s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_TN_reg[ot][1][reg]();
         if (s->cc_op != CC_OP_DYNAMIC)
@@ -4006,10 +5841,15 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
             /* nothing more to do */
             break;
-        default:
-            goto illegal_op;
+        default: /* nop (multi byte) */
+            gen_nop_modrm(s, modrm);
+            break;
         }
         break;
+    case 0x119 ... 0x11f: /* nop (multi byte) */
+        modrm = ldub_code(s->pc++);
+        gen_nop_modrm(s, modrm);
+        break;
     case 0x120: /* mov reg, crN */
     case 0x122: /* mov crN, reg */
         if (s->cpl != 0) {
@@ -4018,21 +5858,31 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             modrm = ldub_code(s->pc++);
             if ((modrm & 0xc0) != 0xc0)
                 goto illegal_op;
-            rm = modrm & 7;
-            reg = (modrm >> 3) & 7;
+            rm = (modrm & 7) | REX_B(s);
+            reg = ((modrm >> 3) & 7) | rex_r;
+            if (CODE64(s))
+                ot = OT_QUAD;
+            else
+                ot = OT_LONG;
             switch(reg) {
             case 0:
             case 2:
             case 3:
             case 4:
+            case 8:
                 if (b & 2) {
-                    gen_op_mov_TN_reg[OT_LONG][0][rm]();
+                    gen_op_mov_TN_reg[ot][0][rm]();
                     gen_op_movl_crN_T0(reg);
-                    gen_op_jmp_im(s->pc - s->cs_base);
+                    gen_jmp_im(s->pc - s->cs_base);
                     gen_eob(s);
                 } else {
-                    gen_op_movl_T0_env(offsetof(CPUX86State,cr[reg]));
-                    gen_op_mov_reg_T0[OT_LONG][rm]();
+#if !defined(CONFIG_USER_ONLY) 
+                    if (reg == 8)
+                        gen_op_movtl_T0_cr8();
+                    else
+#endif
+                        gen_op_movtl_T0_env(offsetof(CPUX86State,cr[reg]));
+                    gen_op_mov_reg_T0[ot][rm]();
                 }
                 break;
             default:
@@ -4048,19 +5898,23 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             modrm = ldub_code(s->pc++);
             if ((modrm & 0xc0) != 0xc0)
                 goto illegal_op;
-            rm = modrm & 7;
-            reg = (modrm >> 3) & 7;
+            rm = (modrm & 7) | REX_B(s);
+            reg = ((modrm >> 3) & 7) | rex_r;
+            if (CODE64(s))
+                ot = OT_QUAD;
+            else
+                ot = OT_LONG;
             /* XXX: do it dynamically with CR4.DE bit */
-            if (reg == 4 || reg == 5)
+            if (reg == 4 || reg == 5 || reg >= 8)
                 goto illegal_op;
             if (b & 2) {
-                gen_op_mov_TN_reg[OT_LONG][0][rm]();
+                gen_op_mov_TN_reg[ot][0][rm]();
                 gen_op_movl_drN_T0(reg);
-                gen_op_jmp_im(s->pc - s->cs_base);
+                gen_jmp_im(s->pc - s->cs_base);
                 gen_eob(s);
             } else {
-                gen_op_movl_T0_env(offsetof(CPUX86State,dr[reg]));
-                gen_op_mov_reg_T0[OT_LONG][rm]();
+                gen_op_movtl_T0_env(offsetof(CPUX86State,dr[reg]));
+                gen_op_mov_reg_T0[ot][rm]();
             }
         }
         break;
@@ -4069,7 +5923,114 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
             gen_op_clts();
+            /* abort block because static cpu state changed */
+            gen_jmp_im(s->pc - s->cs_base);
+            gen_eob(s);
+        }
+        break;
+    /* MMX/SSE/SSE2/PNI support */
+    case 0x1c3: /* MOVNTI reg, mem */
+        if (!(s->cpuid_features & CPUID_SSE2))
+            goto illegal_op;
+        ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+        modrm = ldub_code(s->pc++);
+        mod = (modrm >> 6) & 3;
+        if (mod == 3)
+            goto illegal_op;
+        reg = ((modrm >> 3) & 7) | rex_r;
+        /* generate a generic store */
+        gen_ldst_modrm(s, modrm, ot, reg, 1);
+        break;
+    case 0x1ae:
+        modrm = ldub_code(s->pc++);
+        mod = (modrm >> 6) & 3;
+        op = (modrm >> 3) & 7;
+        switch(op) {
+        case 0: /* fxsave */
+            if (mod == 3 || !(s->cpuid_features & CPUID_FXSR) || 
+                (s->flags & HF_EM_MASK))
+                goto illegal_op;
+            if (s->flags & HF_TS_MASK) {
+                gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+                break;
+            }
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_op_fxsave_A0((s->dflag == 2));
+            break;
+        case 1: /* fxrstor */
+            if (mod == 3 || !(s->cpuid_features & CPUID_FXSR) || 
+                (s->flags & HF_EM_MASK))
+                goto illegal_op;
+            if (s->flags & HF_TS_MASK) {
+                gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+                break;
+            }
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            gen_op_fxrstor_A0((s->dflag == 2));
+            break;
+        case 2: /* ldmxcsr */
+        case 3: /* stmxcsr */
+            if (s->flags & HF_TS_MASK) {
+                gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
+                break;
+            }
+            if ((s->flags & HF_EM_MASK) || !(s->flags & HF_OSFXSR_MASK) ||
+                mod == 3)
+                goto illegal_op;
+            gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            if (op == 2) {
+                gen_op_ld_T0_A0[OT_LONG + s->mem_index]();
+                gen_op_movl_env_T0(offsetof(CPUX86State, mxcsr));
+            } else {
+                gen_op_movl_T0_env(offsetof(CPUX86State, mxcsr));
+                gen_op_st_T0_A0[OT_LONG + s->mem_index]();
+            }
+            break;
+        case 5: /* lfence */
+        case 6: /* mfence */
+            if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE))
+                goto illegal_op;
+            break;
+        case 7: /* sfence / clflush */
+            if ((modrm & 0xc7) == 0xc0) {
+                /* sfence */
+                if (!(s->cpuid_features & CPUID_SSE))
+                    goto illegal_op;
+            } else {
+                /* clflush */
+                if (!(s->cpuid_features & CPUID_CLFLUSH))
+                    goto illegal_op;
+                gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+            }
+            break;
+        default:
+            goto illegal_op;
+        }
+        break;
+    case 0x10d: /* prefetch */
+        modrm = ldub_code(s->pc++);
+        gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+        /* ignore for now */
+        break;
+    case 0x1aa: /* rsm */
+        if (!(s->flags & HF_SMM_MASK))
+            goto illegal_op;
+        if (s->cc_op != CC_OP_DYNAMIC) {
+            gen_op_set_cc_op(s->cc_op);
+            s->cc_op = CC_OP_DYNAMIC;
         }
+        gen_jmp_im(s->pc - s->cs_base);
+        gen_op_rsm();
+        gen_eob(s);
+        break;
+    case 0x110 ... 0x117:
+    case 0x128 ... 0x12f:
+    case 0x150 ... 0x177:
+    case 0x17c ... 0x17f:
+    case 0x1c2:
+    case 0x1c4 ... 0x1c6:
+    case 0x1d0 ... 0x1fe:
+        gen_sse(s, b, pc_start, rex_r);
         break;
     default:
         goto illegal_op;
@@ -4079,6 +6040,8 @@ static uint8_t *disas_insn(DisasContext *s, uint8_t *pc_start)
         gen_op_unlock();
     return s->pc;
  illegal_op:
+    if (s->prefix & PREFIX_LOCK)
+        gen_op_unlock();
     /* XXX: ensure that no lock was generated */
     gen_exception(s, EXCP06_ILLOP, pc_start - s->cs_base);
     return s->pc;
@@ -4094,20 +6057,6 @@ static uint16_t opc_read_flags[NB_OPS] = {
     [INDEX_op_das] = CC_A | CC_C,
     [INDEX_op_daa] = CC_A | CC_C,
 
-    [INDEX_op_adcb_T0_T1_cc] = CC_C,
-    [INDEX_op_adcw_T0_T1_cc] = CC_C,
-    [INDEX_op_adcl_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbb_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbw_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbl_T0_T1_cc] = CC_C,
-
-    [INDEX_op_adcb_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_adcw_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_adcl_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbb_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbw_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_sbbl_mem_T0_T1_cc] = CC_C,
-
     /* subtle: due to the incl/decl implementation, C is used */
     [INDEX_op_update_inc_cc] = CC_C, 
 
@@ -4179,19 +6128,59 @@ static uint16_t opc_read_flags[NB_OPS] = {
     [INDEX_op_cmc] = CC_C,
     [INDEX_op_salc] = CC_C,
 
-    [INDEX_op_rclb_T0_T1_cc] = CC_C,
-    [INDEX_op_rclw_T0_T1_cc] = CC_C,
-    [INDEX_op_rcll_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrb_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrw_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrl_T0_T1_cc] = CC_C,
-
-    [INDEX_op_rclb_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_rclw_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_rcll_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrb_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrw_mem_T0_T1_cc] = CC_C,
-    [INDEX_op_rcrl_mem_T0_T1_cc] = CC_C,
+    /* needed for correct flag optimisation before string ops */
+    [INDEX_op_jnz_ecxw] = CC_OSZAPC,
+    [INDEX_op_jnz_ecxl] = CC_OSZAPC,
+    [INDEX_op_jz_ecxw] = CC_OSZAPC,
+    [INDEX_op_jz_ecxl] = CC_OSZAPC,
+
+#ifdef TARGET_X86_64
+    [INDEX_op_jb_subq] = CC_C,
+    [INDEX_op_jz_subq] = CC_Z,
+    [INDEX_op_jbe_subq] = CC_Z | CC_C,
+    [INDEX_op_js_subq] = CC_S,
+    [INDEX_op_jl_subq] = CC_O | CC_S,
+    [INDEX_op_jle_subq] = CC_O | CC_S | CC_Z,
+
+    [INDEX_op_loopnzq] = CC_Z,
+    [INDEX_op_loopzq] = CC_Z,
+
+    [INDEX_op_setb_T0_subq] = CC_C,
+    [INDEX_op_setz_T0_subq] = CC_Z,
+    [INDEX_op_setbe_T0_subq] = CC_Z | CC_C,
+    [INDEX_op_sets_T0_subq] = CC_S,
+    [INDEX_op_setl_T0_subq] = CC_O | CC_S,
+    [INDEX_op_setle_T0_subq] = CC_O | CC_S | CC_Z,
+
+    [INDEX_op_jnz_ecxq] = CC_OSZAPC,
+    [INDEX_op_jz_ecxq] = CC_OSZAPC,
+#endif
+    /* DEF_READF(SUFFIX): opc_read_flags entries for one SUFFIX variant; adc/sbb/rcl/rcr ops get CC_C (q forms only via X86_64_DEF) */
+#define DEF_READF(SUFFIX)\
+    [INDEX_op_adcb ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_adcw ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_adcl ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    X86_64_DEF([INDEX_op_adcq ## SUFFIX ## _T0_T1_cc] = CC_C,)\
+    [INDEX_op_sbbb ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_sbbw ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_sbbl ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    X86_64_DEF([INDEX_op_sbbq ## SUFFIX ## _T0_T1_cc] = CC_C,)\
+\
+    [INDEX_op_rclb ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_rclw ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_rcll ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    X86_64_DEF([INDEX_op_rclq ## SUFFIX ## _T0_T1_cc] = CC_C,)\
+    [INDEX_op_rcrb ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_rcrw ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    [INDEX_op_rcrl ## SUFFIX ## _T0_T1_cc] = CC_C,\
+    X86_64_DEF([INDEX_op_rcrq ## SUFFIX ## _T0_T1_cc] = CC_C,)
+
+    DEF_READF( )
+    DEF_READF(_raw)
+#ifndef CONFIG_USER_ONLY
+    DEF_READF(_kernel)
+    DEF_READF(_user)
+#endif
 };
 
 /* flags written by an operation */
@@ -4204,29 +6193,24 @@ static uint16_t opc_write_flags[NB_OPS] = {
     [INDEX_op_update_inc_cc] = CC_OSZAPC, 
     [INDEX_op_testl_T0_T1_cc] = CC_OSZAPC,
 
-    [INDEX_op_adcb_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_adcw_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_adcl_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbb_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbw_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbl_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_adcb_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_adcw_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_adcl_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbb_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbw_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sbbl_mem_T0_T1_cc] = CC_OSZAPC,
-
     [INDEX_op_mulb_AL_T0] = CC_OSZAPC,
-    [INDEX_op_imulb_AL_T0] = CC_OSZAPC,
     [INDEX_op_mulw_AX_T0] = CC_OSZAPC,
-    [INDEX_op_imulw_AX_T0] = CC_OSZAPC,
     [INDEX_op_mull_EAX_T0] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_mulq_EAX_T0] = CC_OSZAPC,)
+    [INDEX_op_imulb_AL_T0] = CC_OSZAPC,
+    [INDEX_op_imulw_AX_T0] = CC_OSZAPC,
     [INDEX_op_imull_EAX_T0] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_imulq_EAX_T0] = CC_OSZAPC,)
     [INDEX_op_imulw_T0_T1] = CC_OSZAPC,
     [INDEX_op_imull_T0_T1] = CC_OSZAPC,
-    
+    X86_64_DEF([INDEX_op_imulq_T0_T1] = CC_OSZAPC,)
+
+    /* sse */
+    [INDEX_op_ucomiss] = CC_OSZAPC,
+    [INDEX_op_ucomisd] = CC_OSZAPC,
+    [INDEX_op_comiss] = CC_OSZAPC,
+    [INDEX_op_comisd] = CC_OSZAPC,
+
     /* bcd */
     [INDEX_op_aam] = CC_OSZAPC,
     [INDEX_op_aad] = CC_OSZAPC,
@@ -4238,109 +6222,116 @@ static uint16_t opc_write_flags[NB_OPS] = {
     [INDEX_op_movb_eflags_T0] = CC_S | CC_Z | CC_A | CC_P | CC_C,
     [INDEX_op_movw_eflags_T0] = CC_OSZAPC,
     [INDEX_op_movl_eflags_T0] = CC_OSZAPC,
+    [INDEX_op_movw_eflags_T0_io] = CC_OSZAPC,
+    [INDEX_op_movl_eflags_T0_io] = CC_OSZAPC,
+    [INDEX_op_movw_eflags_T0_cpl0] = CC_OSZAPC,
+    [INDEX_op_movl_eflags_T0_cpl0] = CC_OSZAPC,
     [INDEX_op_clc] = CC_C,
     [INDEX_op_stc] = CC_C,
     [INDEX_op_cmc] = CC_C,
 
-    [INDEX_op_rolb_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rolw_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_roll_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorb_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorw_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorl_T0_T1_cc] = CC_O | CC_C,
-
-    [INDEX_op_rclb_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rclw_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcll_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrb_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrw_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrl_T0_T1_cc] = CC_O | CC_C,
-
-    [INDEX_op_shlb_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shlw_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shll_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_shrb_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shrw_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shrl_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_sarb_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sarw_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sarl_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_shldw_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shldl_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shldw_T0_T1_im_cc] = CC_OSZAPC,
-    [INDEX_op_shldl_T0_T1_im_cc] = CC_OSZAPC,
-
-    [INDEX_op_shrdw_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shrdl_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shrdw_T0_T1_im_cc] = CC_OSZAPC,
-    [INDEX_op_shrdl_T0_T1_im_cc] = CC_OSZAPC,
-
-    [INDEX_op_rolb_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rolw_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_roll_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorb_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorw_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rorl_mem_T0_T1_cc] = CC_O | CC_C,
-
-    [INDEX_op_rclb_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rclw_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcll_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrb_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrw_mem_T0_T1_cc] = CC_O | CC_C,
-    [INDEX_op_rcrl_mem_T0_T1_cc] = CC_O | CC_C,
-
-    [INDEX_op_shlb_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shlw_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shll_mem_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_shrb_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shrw_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_shrl_mem_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_sarb_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sarw_mem_T0_T1_cc] = CC_OSZAPC,
-    [INDEX_op_sarl_mem_T0_T1_cc] = CC_OSZAPC,
-
-    [INDEX_op_shldw_mem_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shldl_mem_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shldw_mem_T0_T1_im_cc] = CC_OSZAPC,
-    [INDEX_op_shldl_mem_T0_T1_im_cc] = CC_OSZAPC,
-
-    [INDEX_op_shrdw_mem_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shrdl_mem_T0_T1_ECX_cc] = CC_OSZAPC,
-    [INDEX_op_shrdw_mem_T0_T1_im_cc] = CC_OSZAPC,
-    [INDEX_op_shrdl_mem_T0_T1_im_cc] = CC_OSZAPC,
-
     [INDEX_op_btw_T0_T1_cc] = CC_OSZAPC,
     [INDEX_op_btl_T0_T1_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_btq_T0_T1_cc] = CC_OSZAPC,)
     [INDEX_op_btsw_T0_T1_cc] = CC_OSZAPC,
     [INDEX_op_btsl_T0_T1_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_btsq_T0_T1_cc] = CC_OSZAPC,)
     [INDEX_op_btrw_T0_T1_cc] = CC_OSZAPC,
     [INDEX_op_btrl_T0_T1_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_btrq_T0_T1_cc] = CC_OSZAPC,)
     [INDEX_op_btcw_T0_T1_cc] = CC_OSZAPC,
     [INDEX_op_btcl_T0_T1_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_btcq_T0_T1_cc] = CC_OSZAPC,)
 
     [INDEX_op_bsfw_T0_cc] = CC_OSZAPC,
     [INDEX_op_bsfl_T0_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_bsfq_T0_cc] = CC_OSZAPC,)
     [INDEX_op_bsrw_T0_cc] = CC_OSZAPC,
     [INDEX_op_bsrl_T0_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_bsrq_T0_cc] = CC_OSZAPC,)
 
     [INDEX_op_cmpxchgb_T0_T1_EAX_cc] = CC_OSZAPC,
     [INDEX_op_cmpxchgw_T0_T1_EAX_cc] = CC_OSZAPC,
     [INDEX_op_cmpxchgl_T0_T1_EAX_cc] = CC_OSZAPC,
-
-    [INDEX_op_cmpxchgb_mem_T0_T1_EAX_cc] = CC_OSZAPC,
-    [INDEX_op_cmpxchgw_mem_T0_T1_EAX_cc] = CC_OSZAPC,
-    [INDEX_op_cmpxchgl_mem_T0_T1_EAX_cc] = CC_OSZAPC,
+    X86_64_DEF([INDEX_op_cmpxchgq_T0_T1_EAX_cc] = CC_OSZAPC,)
 
     [INDEX_op_cmpxchg8b] = CC_Z,
     [INDEX_op_lar] = CC_Z,
     [INDEX_op_lsl] = CC_Z,
+    [INDEX_op_verr] = CC_Z,
+    [INDEX_op_verw] = CC_Z,
     [INDEX_op_fcomi_ST0_FT0] = CC_Z | CC_P | CC_C,
     [INDEX_op_fucomi_ST0_FT0] = CC_Z | CC_P | CC_C,
+    /* DEF_WRITEF(SUFFIX): per-SUFFIX entries; adc/sbb/shl/shr/sar/shld/shrd/cmpxchg -> CC_OSZAPC, rol/ror/rcl/rcr -> CC_O | CC_C */
+#define DEF_WRITEF(SUFFIX)\
+    [INDEX_op_adcb ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_adcw ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_adcl ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_adcq ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,)\
+    [INDEX_op_sbbb ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_sbbw ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_sbbl ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_sbbq ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_rolb ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rolw ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_roll ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    X86_64_DEF([INDEX_op_rolq ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,)\
+    [INDEX_op_rorb ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rorw ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rorl ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    X86_64_DEF([INDEX_op_rorq ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,)\
+\
+    [INDEX_op_rclb ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rclw ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rcll ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    X86_64_DEF([INDEX_op_rclq ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,)\
+    [INDEX_op_rcrb ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rcrw ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    [INDEX_op_rcrl ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,\
+    X86_64_DEF([INDEX_op_rcrq ## SUFFIX ## _T0_T1_cc] = CC_O | CC_C,)\
+\
+    [INDEX_op_shlb ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_shlw ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_shll ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shlq ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_shrb ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_shrw ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_shrl ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shrq ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_sarb ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_sarw ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    [INDEX_op_sarl ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_sarq ## SUFFIX ## _T0_T1_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_shldw ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,\
+    [INDEX_op_shldl ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shldq ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,)\
+    [INDEX_op_shldw ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,\
+    [INDEX_op_shldl ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shldq ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_shrdw ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,\
+    [INDEX_op_shrdl ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shrdq ## SUFFIX ## _T0_T1_ECX_cc] = CC_OSZAPC,)\
+    [INDEX_op_shrdw ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,\
+    [INDEX_op_shrdl ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_shrdq ## SUFFIX ## _T0_T1_im_cc] = CC_OSZAPC,)\
+\
+    [INDEX_op_cmpxchgb ## SUFFIX ## _T0_T1_EAX_cc] = CC_OSZAPC,\
+    [INDEX_op_cmpxchgw ## SUFFIX ## _T0_T1_EAX_cc] = CC_OSZAPC,\
+    [INDEX_op_cmpxchgl ## SUFFIX ## _T0_T1_EAX_cc] = CC_OSZAPC,\
+    X86_64_DEF([INDEX_op_cmpxchgq ## SUFFIX ## _T0_T1_EAX_cc] = CC_OSZAPC,)
+
+
+    DEF_WRITEF( )
+    DEF_WRITEF(_raw)
+#ifndef CONFIG_USER_ONLY
+    DEF_WRITEF(_kernel)
+    DEF_WRITEF(_user)
+#endif
 };
 
 /* simpler form of an operation if no flags need to be generated */
@@ -4352,33 +6343,39 @@ static uint16_t opc_simpler[NB_OPS] = {
     /* broken: CC_OP logic must be rewritten */
     [INDEX_op_update_inc_cc] = INDEX_op_nop,
 #endif
-    [INDEX_op_rolb_T0_T1_cc] = INDEX_op_rolb_T0_T1,
-    [INDEX_op_rolw_T0_T1_cc] = INDEX_op_rolw_T0_T1,
-    [INDEX_op_roll_T0_T1_cc] = INDEX_op_roll_T0_T1,
-
-    [INDEX_op_rorb_T0_T1_cc] = INDEX_op_rorb_T0_T1,
-    [INDEX_op_rorw_T0_T1_cc] = INDEX_op_rorw_T0_T1,
-    [INDEX_op_rorl_T0_T1_cc] = INDEX_op_rorl_T0_T1,
-
-    [INDEX_op_rolb_mem_T0_T1_cc] = INDEX_op_rolb_mem_T0_T1,
-    [INDEX_op_rolw_mem_T0_T1_cc] = INDEX_op_rolw_mem_T0_T1,
-    [INDEX_op_roll_mem_T0_T1_cc] = INDEX_op_roll_mem_T0_T1,
-
-    [INDEX_op_rorb_mem_T0_T1_cc] = INDEX_op_rorb_mem_T0_T1,
-    [INDEX_op_rorw_mem_T0_T1_cc] = INDEX_op_rorw_mem_T0_T1,
-    [INDEX_op_rorl_mem_T0_T1_cc] = INDEX_op_rorl_mem_T0_T1,
 
     [INDEX_op_shlb_T0_T1_cc] = INDEX_op_shlb_T0_T1,
     [INDEX_op_shlw_T0_T1_cc] = INDEX_op_shlw_T0_T1,
     [INDEX_op_shll_T0_T1_cc] = INDEX_op_shll_T0_T1,
+    X86_64_DEF([INDEX_op_shlq_T0_T1_cc] = INDEX_op_shlq_T0_T1,)
 
     [INDEX_op_shrb_T0_T1_cc] = INDEX_op_shrb_T0_T1,
     [INDEX_op_shrw_T0_T1_cc] = INDEX_op_shrw_T0_T1,
     [INDEX_op_shrl_T0_T1_cc] = INDEX_op_shrl_T0_T1,
+    X86_64_DEF([INDEX_op_shrq_T0_T1_cc] = INDEX_op_shrq_T0_T1,)
 
     [INDEX_op_sarb_T0_T1_cc] = INDEX_op_sarb_T0_T1,
     [INDEX_op_sarw_T0_T1_cc] = INDEX_op_sarw_T0_T1,
     [INDEX_op_sarl_T0_T1_cc] = INDEX_op_sarl_T0_T1,
+    X86_64_DEF([INDEX_op_sarq_T0_T1_cc] = INDEX_op_sarq_T0_T1,)
+    /* DEF_SIMPLER(SUFFIX): per-SUFFIX entries mapping each rol/ror *_T0_T1_cc op to the same op name without the _cc suffix */
+#define DEF_SIMPLER(SUFFIX)\
+    [INDEX_op_rolb ## SUFFIX ## _T0_T1_cc] = INDEX_op_rolb ## SUFFIX ## _T0_T1,\
+    [INDEX_op_rolw ## SUFFIX ## _T0_T1_cc] = INDEX_op_rolw ## SUFFIX ## _T0_T1,\
+    [INDEX_op_roll ## SUFFIX ## _T0_T1_cc] = INDEX_op_roll ## SUFFIX ## _T0_T1,\
+    X86_64_DEF([INDEX_op_rolq ## SUFFIX ## _T0_T1_cc] = INDEX_op_rolq ## SUFFIX ## _T0_T1,)\
+\
+    [INDEX_op_rorb ## SUFFIX ## _T0_T1_cc] = INDEX_op_rorb ## SUFFIX ## _T0_T1,\
+    [INDEX_op_rorw ## SUFFIX ## _T0_T1_cc] = INDEX_op_rorw ## SUFFIX ## _T0_T1,\
+    [INDEX_op_rorl ## SUFFIX ## _T0_T1_cc] = INDEX_op_rorl ## SUFFIX ## _T0_T1,\
+    X86_64_DEF([INDEX_op_rorq ## SUFFIX ## _T0_T1_cc] = INDEX_op_rorq ## SUFFIX ## _T0_T1,)
+
+    DEF_SIMPLER( )
+    DEF_SIMPLER(_raw)
+#ifndef CONFIG_USER_ONLY
+    DEF_SIMPLER(_kernel)
+    DEF_SIMPLER(_user)
+#endif
 };
 
 void optimize_flags_init(void)
@@ -4426,18 +6423,19 @@ static inline int gen_intermediate_code_internal(CPUState *env,
                                                  int search_pc)
 {
     DisasContext dc1, *dc = &dc1;
-    uint8_t *pc_ptr;
+    target_ulong pc_ptr;
     uint16_t *gen_opc_end;
-    int flags, j, lj;
-    uint8_t *pc_start;
-    uint8_t *cs_base;
+    int flags, j, lj, cflags;
+    target_ulong pc_start;
+    target_ulong cs_base;
     
     /* generate intermediate code */
-    pc_start = (uint8_t *)tb->pc;
-    cs_base = (uint8_t *)tb->cs_base;
+    pc_start = tb->pc;
+    cs_base = tb->cs_base;
     flags = tb->flags;
-       
-    dc->pe = env->cr[0] & CR0_PE_MASK;
+    cflags = tb->cflags;
+
+    dc->pe = (flags >> HF_PE_SHIFT) & 1;
     dc->code32 = (flags >> HF_CS32_SHIFT) & 1;
     dc->ss32 = (flags >> HF_SS32_SHIFT) & 1;
     dc->addseg = (flags >> HF_ADDSEG_SHIFT) & 1;
@@ -4455,33 +6453,42 @@ static inline int gen_intermediate_code_internal(CPUState *env,
     dc->mem_index = 0;
     if (flags & HF_SOFTMMU_MASK) {
         if (dc->cpl == 3)
-            dc->mem_index = 6;
+            dc->mem_index = 2 * 4;
         else
-            dc->mem_index = 3;
+            dc->mem_index = 1 * 4;
     }
-    dc->jmp_opt = !(dc->tf || env->singlestep_enabled
+    dc->cpuid_features = env->cpuid_features;
+    dc->cpuid_ext_features = env->cpuid_ext_features;
+#ifdef TARGET_X86_64
+    dc->lma = (flags >> HF_LMA_SHIFT) & 1;
+    dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
+#endif
+    dc->flags = flags;
+    dc->jmp_opt = !(dc->tf || env->singlestep_enabled ||
+                    (flags & HF_INHIBIT_IRQ_MASK)
 #ifndef CONFIG_SOFTMMU
                     || (flags & HF_SOFTMMU_MASK)
 #endif
                     );
+#if 0
+    /* check addseg logic */
+    if (!dc->addseg && (dc->vm86 || !dc->pe || !dc->code32))
+        printf("ERROR addseg\n");
+#endif
+
     gen_opc_ptr = gen_opc_buf;
     gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
     gen_opparam_ptr = gen_opparam_buf;
+    nb_gen_labels = 0;
 
     dc->is_jmp = DISAS_NEXT;
     pc_ptr = pc_start;
     lj = -1;
 
-    /* if irq were inhibited for the next instruction, we can disable
-       them here as it is simpler (otherwise jumps would have to
-       handled as special case) */
-    if (flags & HF_INHIBIT_IRQ_MASK) {
-        gen_op_reset_inhibit_irq();
-    }
     for(;;) {
         if (env->nb_breakpoints > 0) {
             for(j = 0; j < env->nb_breakpoints; j++) {
-                if (env->breakpoints[j] == (unsigned long)pc_ptr) {
+                if (env->breakpoints[j] == pc_ptr) {
                     gen_debug(dc, pc_ptr - dc->cs_base);
                     break;
                 }
@@ -4494,7 +6501,7 @@ static inline int gen_intermediate_code_internal(CPUState *env,
                 while (lj < j)
                     gen_opc_instr_start[lj++] = 0;
             }
-            gen_opc_pc[lj] = (uint32_t)pc_ptr;
+            gen_opc_pc[lj] = pc_ptr;
             gen_opc_cc_op[lj] = dc->cc_op;
             gen_opc_instr_start[lj] = 1;
         }
@@ -4504,15 +6511,20 @@ static inline int gen_intermediate_code_internal(CPUState *env,
             break;
         /* if single step mode, we generate only one instruction and
            generate an exception */
-        if (dc->tf || dc->singlestep_enabled) {
-            gen_op_jmp_im(pc_ptr - dc->cs_base);
+        /* if irqs were inhibited with HF_INHIBIT_IRQ_MASK, we clear
+           the flag and abort the translation to give the irqs a
+           chance to happen */
+        if (dc->tf || dc->singlestep_enabled || 
+            (flags & HF_INHIBIT_IRQ_MASK) ||
+            (cflags & CF_SINGLE_INSN)) {
+            gen_jmp_im(pc_ptr - dc->cs_base);
             gen_eob(dc);
             break;
         }
         /* if too long translation, stop generation too */
         if (gen_opc_ptr >= gen_opc_end ||
             (pc_ptr - pc_start) >= (TARGET_PAGE_SIZE - 32)) {
-            gen_op_jmp_im(pc_ptr - dc->cs_base);
+            gen_jmp_im(pc_ptr - dc->cs_base);
             gen_eob(dc);
             break;
         }
@@ -4527,15 +6539,26 @@ static inline int gen_intermediate_code_internal(CPUState *env,
     }
         
 #ifdef DEBUG_DISAS
-    if (loglevel) {
+    if (loglevel & CPU_LOG_TB_CPU) {
+        cpu_dump_state(env, logfile, fprintf, X86_DUMP_CCOP);
+    }
+    if (loglevel & CPU_LOG_TB_IN_ASM) {
+        int disas_flags;
         fprintf(logfile, "----------------\n");
         fprintf(logfile, "IN: %s\n", lookup_symbol(pc_start));
-       disas(logfile, pc_start, pc_ptr - pc_start, 0, !dc->code32);
-        fprintf(logfile, "\n");
-
-        fprintf(logfile, "OP:\n");
-        dump_ops(gen_opc_buf, gen_opparam_buf);
+#ifdef TARGET_X86_64
+        if (dc->code64)
+            disas_flags = 2;
+        else
+#endif
+            disas_flags = !dc->code32;
+       target_disas(logfile, pc_start, pc_ptr - pc_start, disas_flags);
         fprintf(logfile, "\n");
+        if (loglevel & CPU_LOG_TB_OP) {
+            fprintf(logfile, "OP:\n");
+            dump_ops(gen_opc_buf, gen_opparam_buf);
+            fprintf(logfile, "\n");
+        }
     }
 #endif
 
@@ -4543,7 +6566,7 @@ static inline int gen_intermediate_code_internal(CPUState *env,
     optimize_flags(gen_opc_buf, gen_opc_ptr - gen_opc_buf);
 
 #ifdef DEBUG_DISAS
-    if (loglevel) {
+    if (loglevel & CPU_LOG_TB_OP_OPT) {
         fprintf(logfile, "AFTER FLAGS OPT:\n");
         dump_ops(gen_opc_buf, gen_opparam_buf);
         fprintf(logfile, "\n");