Index: src/blob.c
==================================================================
--- src/blob.c
+++ src/blob.c
@@ -1027,10 +1027,65 @@
     if( z[i]!='\r' ) z[j++] = z[i];
   }
   z[j] = 0;
   p->nUsed = j;
 }
+
+/*
+** Convert blob from cp1252 to utf-8. As cp1252 is a superset
+** of iso8859-1, this is useful on UNIX as well.
+**
+** This table maps the 32 cp1252 bytes 0x80..0x9F to Unicode code points.
+*/
+
+static const unsigned short cp1252[32] = {
+  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
+    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
+};
+
+void blob_cp1252_to_utf8(Blob *p){  /* Re-encodes p's content in place */
+  unsigned char *z = (unsigned char *)p->aData;
+  int j   = p->nUsed;      /* Input size now, output size after "j += n" */
+  int i, n;                /* i: input index;  n: extra bytes UTF-8 needs */
+  for(i=n=0; i<j; i++){    /* Pass 1: count how much the content grows */
+    if( z[i]>=0x80 ){
+      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
+        n++;               /* Code points >=0x800 take 3 bytes, not 2 */
+      }
+      n++;                 /* Every byte >=0x80 becomes at least 2 bytes */
+    }
+  }
+  j += n;
+  if( j>=p->nAlloc ){      /* Need room for j bytes plus the NUL below */
+    blob_resize(p, j);
+    z = (unsigned char *)p->aData;  /* Re-read the pointer after resizing */
+  }
+  p->nUsed = j;
+  z[j] = 0;
+  while( j>i ){            /* Pass 2, back to front; j==i: rest is in place */
+    if( z[--i]>=0x80 ){
+      if( z[i]<0xa0 ){     /* 0x80..0x9F: translate through the table */
+        unsigned short sym = cp1252[z[i]&0x1f];
+        if( sym>=0x800 ){  /* 3-byte UTF-8 sequence */
+          z[--j] = 0x80 | (sym&0x3f);
+          z[--j] = 0x80 | ((sym>>6)&0x3f);
+          z[--j] = 0xe0 | (sym>>12);
+        }else{             /* 2-byte UTF-8 sequence */
+          z[--j] = 0x80 | (sym&0x3f);
+          z[--j] = 0xc0 | (sym>>6);
+        }
+      }else{               /* 0xA0..0xFF: the byte value is the code point */
+        z[--j] = 0x80 | (z[i]&0x3F);
+        z[--j] = 0xC0 | (z[i]>>6);
+      }
+    }else{                 /* Below 0x80: copied unchanged */
+      z[--j] = z[i];
+    }
+  }
+}
 
 /*
 ** Shell-escape the given string.  Append the result to a blob.
 */
 void shell_escape(Blob *pBlob, const char *zIn){

Index: src/checkin.c
==================================================================
--- src/checkin.c
+++ src/checkin.c
@@ -908,10 +908,66 @@
   static int allOk = 0;   /* Set to true to disable this routine */
 
   if( allOk ) return 0;
   fUnicode = starts_with_utf16_bom(p, 0, 0);
   eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
+  /* eType values below -2 carry an extra warning: -4 means a line too
+  ** long for the diff logic, -3 and -5 mean invalid UTF-8.  Ask the user
+  ** what to do, then add 4 so the code below sees 0, 1 or -1 again.
+  */
+  if( eType<-2 ){
+    const char *zWarning;   /* What was found in the file */
+    const char *zDisable;   /* Setting named in the prompt */
+    const char *zConvert;   /* "c=convert/" when offered, else "" */
+    Blob ans;               /* The user's reply */
+    char cReply;            /* First character of the reply */
+
+    if( eType==-4 ){
+      if( binOk ) goto go_on;
+      zWarning = "long lines";
+      zDisable = "\"binary-glob\" setting";
+      zConvert = "";
+    }else{
+      if( encodingOk ) goto go_on;
+      zWarning = "invalid UTF-8";
+      zDisable = "\"encoding-glob\" setting";
+      zConvert = "c=convert/";
+    }
+    blob_zero(&ans);
+    file_relative_name(zFilename, &fname, 0);
+    zMsg = mprintf(
+         "%s appears to be text, but contains %s. Use --no-warnings or the"
+         " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ",
+         blob_str(&fname), zWarning, zDisable, zConvert);
+    prompt_user(zMsg, &ans);
+    fossil_free(zMsg);
+    cReply = blob_str(&ans)[0];
+    if( cReply=='a' || cReply=='A' ){
+      allOk = 1;   /* The prompt offers "a=all": accept and stop asking */
+    }else if( *zConvert && (cReply=='c' || cReply=='C') ){
+      char *zOrig = file_newname(zFilename, "original", 1);
+      FILE *f;
+      blob_write_to_file(p, zOrig);   /* Save the unconverted content */
+      fossil_free(zOrig);
+      f = fossil_fopen(zFilename, "wb");
+      if( f==0 ){
+        fossil_fatal("unable to open %s for writing", zFilename);
+      }else{
+        blob_cp1252_to_utf8(p);
+        fwrite(blob_buffer(p), 1, blob_size(p), f);
+        fclose(f);
+      }
+      blob_reset(&ans);
+      return 1;
+    }else if( cReply!='y' && cReply!='Y' ){
+      fossil_fatal("Abandoning commit due to %s in %s",
+                   zWarning, blob_str(&fname));
+    }
+    blob_reset(&ans);
+  go_on:
+    eType += 4;
+  }
   if( eType==0 || eType==-1 || fUnicode ){
     const char *zWarning;
     const char *zDisable;
     const char *zConvert = "c=convert/";
     Blob ans;

Index: src/diff.c
==================================================================
--- src/diff.c
+++ src/diff.c
@@ -57,11 +57,11 @@
     "more than 10,000 changes\n"
 
 #define DIFF_TOO_MANY_CHANGES_HTML \
     "<p class='generalError'>More than 10,000 changes</p>\n"
 
-#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
+#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0) /* 0 or -4 */
 #endif /* INTERFACE */
 
 /*
 ** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
 */
@@ -181,70 +181,103 @@
   /* Return results */
   *pnLine = nLine;
   return a;
 }
 
+/*
+** Macro which checks for proper UTF-8 when the lead byte c is >= 0x80.
+** It uses the method described in:
+**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
+** except that the "overlong form" is not considered invalid, because
+** some languages like Java and Tcl use it.
+**
+** The macro reads and modifies the caller's locals c, z, n, j and result.
+** An invalid byte sets result |= 4.  For each valid continuation byte,
+** z, n and j are advanced so that byte is not examined again.
+*/
+#define CHECKUTF8(c) \
+if( c<0xC0 || c>=0xF8 ){ \
+  result |= 4;  /* Stray continuation byte, or lead byte >= 0xF8 */ \
+}else do{ \
+  /* The next byte must exist and be a continuation byte (0x80..0xBF) */ \
+  if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
+    result |= 4; /* Truncated or malformed sequence, continue */ \
+    break; \
+  }else{ \
+    /* Consume it; after the shift c>=0xC0 means more bytes are expected */ \
+    c<<=1; --n; ++j; ++z; \
+  } \
+}while( c>=0xC0 );
+
 /*
 ** This function attempts to scan each logical line within the blob to
 ** determine the type of content it appears to contain.  Possible return
 ** values are:
 **
 **  (1) -- The content appears to consist entirely of text, with lines
-**         delimited by line-feed characters; however, the encoding may
-**         not be UTF-8.
+**         delimited by line-feed characters.
 **
 **  (0) -- The content appears to be binary because it contains embedded
 **         NUL characters or an extremely long line.  Since this function
 **         does not understand UTF-16, it may falsely consider UTF-16 text
 **         to be binary.
 **
 ** (-1) -- The content appears to consist entirely of text, with lines
-**         delimited by carriage-return, line-feed pairs; however, the
-**         encoding may not be UTF-8.
+**         delimited by carriage-return, line-feed pairs.
+**
+** (-3, -5) Same as (1, -1) respectively, except the content is not valid UTF-8.
+**
+** (-4) -- The same as 0, but the determination is based on the fact that
+**         the blob might be text (any encoding) but it has a line length
+**         bigger than the diff logic in fossil can handle.
 **
 ************************************ WARNING **********************************
 **
-** This function does not validate that the blob content is properly formed
-** UTF-8.  It assumes that all code points are the same size.  It does not
-** validate any code points.  It makes no attempt to detect if any [invalid]
-** switches between UTF-8 and other encodings occur.
+** This function does not validate any code points.
 **
 ** The only code points that this function cares about are the NUL character,
 ** carriage-return, and line-feed.
 **
 ************************************ WARNING **********************************
 */
 int looks_like_utf8(const Blob *pContent){
-  const char *z = blob_buffer(pContent);
+  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
   unsigned int n = blob_size(pContent);
-  int j, c;
-  int result = 1;  /* Assume UTF-8 text with no CR/NL */
+  unsigned int j;
+  unsigned char c;
+  int result = 0;  /* Flag bits: 2 = CR/NL seen, 4 = invalid UTF-8 seen */
 
   /* Check individual lines.
   */
-  if( n==0 ) return result;  /* Empty file -> text */
+  if( n==0 ) return 1;  /* Empty file -> text */
   c = *z;
-  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
   j = (c!='\n');
+  if( c&0x80 ){
+    CHECKUTF8(c)  /* May advance z, n and j past continuation bytes */
+  } else if( c==0 ){
+    return 0;  /* Zero byte in a file -> binary */
+  }
   while( --n>0 ){
     c = *++z; ++j;
-    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
-    if( c=='\n' ){
-      int c2 = z[-1];
-      if( c2=='\r' ){
-        result = -1;  /* Contains CR/NL, continue */
+    if( c&0x80 ){
+      CHECKUTF8(c)  /* May advance z, n and j past continuation bytes */
+    } else if( c==0 ){
+      return 0;  /* Zero byte in a file -> binary */
+    } else if( c=='\n' ){
+      if( z[-1]=='\r' ){
+        result |= 2;  /* Contains CR/NL, continue */
       }
       if( j>LENGTH_MASK ){
-        return 0;  /* Very long line -> binary */
+        return -4;  /* Very long line -> too long for the diff logic */
       }
       j = 0;
     }
   }
   if( j>LENGTH_MASK ){
-    return 0;  /* Very long line -> binary */
+    return -4;  /* Very long line -> too long for the diff logic */
   }
-  return result;  /* No problems seen -> not binary */
+  return 1-result;  /* 1, -1, -3 or -5, depending on the flag bits */
 }
 
 /*
 ** Define the type needed to represent a Unicode (UTF-16) character.
 */
@@ -288,10 +321,14 @@
 **         to be binary.
 **
 ** (-1) -- The content appears to consist entirely of text, with lines
 **         delimited by carriage-return, line-feed pairs; however, the
 **         encoding may not be UTF-16.
+**
+** (-4) -- The same as 0, but the determination is based on the fact that
+**         the blob might be text (any encoding) but it has a line length
+**         bigger than the diff logic in fossil can handle.
 **
 ************************************ WARNING **********************************
 **
 ** This function does not validate that the blob content is properly formed
 ** UTF-16.  It assumes that all code points are the same size.  It does not
@@ -323,17 +360,17 @@
       int c2 = z[-1];
       if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
         result = -1;  /* Contains CR/NL, continue */
       }
       if( j>UTF16_LENGTH_MASK ){
-        return 0;  /* Very long line -> binary */
+        return -4;  /* Very long line -> binary */
       }
       j = 0;
     }
   }
   if( j>UTF16_LENGTH_MASK ){
-    return 0;  /* Very long line -> binary */
+    return -4;  /* Very long line -> binary */
   }
   return result;  /* No problems seen -> not binary */
 }
 
 /*