Index: src/checkin.c ================================================================== --- src/checkin.c +++ src/checkin.c @@ -886,20 +886,20 @@ ** Issue a warning and give the user an opportunity to abandon out ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending ** is seen in a text file. */ static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){ - int eType; /* return value of looks_like_text() */ + int eType; /* return value of looks_like_utf8/utf16() */ int fUnicode; /* return value of starts_with_utf16_bom() */ char *zMsg; /* Warning message */ Blob fname; /* Relative pathname of the file */ static int allOk = 0; /* Set to true to disable this routine */ if( allOk ) return; - eType = looks_like_text(p); fUnicode = starts_with_utf16_bom(p); - if( eType==-1 || fUnicode ){ + eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); + if( eType==0 || eType==-1 || fUnicode ){ const char *zWarning; Blob ans; char cReply; if( eType==-1 && fUnicode ){ @@ -907,10 +907,12 @@ }else if( eType==-1 ){ if( crnlOk ){ return; /* We don't want CR/NL warnings for this file. */ } zWarning = "CR/NL line endings"; + }else if( eType==0 ){ + zWarning = "binary data"; }else{ zWarning = "Unicode"; } file_relative_name(zFilename, &fname, 0); blob_zero(&ans); Index: src/diff.c ================================================================== --- src/diff.c +++ src/diff.c @@ -48,15 +48,15 @@ "cannot compute difference between binary files\n" #define DIFF_CANNOT_COMPUTE_SYMLINK \ "cannot compute difference between symlink and regular file\n" -#define looks_like_binary(blob) (looks_like_text((blob)) == 0) +#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) #endif /* INTERFACE */ /* -** Maximum length of a line in a text file. (8192) +** Maximum length of a line in a text file, in bytes. (8192) */ #define LENGTH_MASK_SZ 13 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) /* @@ -179,34 +179,34 @@ ** (1) -- The content appears to consist entirely of text, with lines ** delimited by line-feed characters; however, the encoding may ** not be UTF-8. ** ** (0) -- The content appears to be binary because it contains embedded -** NUL (\000) characters or an extremely long line. Since this -** function does not understand UTF-16, it may falsely consider -** UTF-16 text to be binary. +** NUL characters or an extremely long line. Since this function +** does not understand UTF-16, it may falsely consider UTF-16 text +** to be binary. ** ** (-1) -- The content appears to consist entirely of text, with lines ** delimited by carriage-return, line-feed pairs; however, the ** encoding may not be UTF-8. ** */ -int looks_like_text(const Blob *pContent){ +int looks_like_utf8(const Blob *pContent){ const char *z = blob_buffer(pContent); unsigned int n = blob_size(pContent); int j, c; - int result = 1; /* Assume text with no CR/NL */ + int result = 1; /* Assume UTF-8 text with no CR/NL */ /* Check individual lines. */ if( n==0 ) return result; /* Empty file -> text */ c = *z; - if( c==0 ) return 0; /* \000 byte in a file -> binary */ + if( c==0 ) return 0; /* Zero byte in a file -> binary */ j = (c!='\n'); while( --n>0 ){ c = *++z; ++j; - if( c==0 ) return 0; /* \000 byte in a file -> binary */ + if( c==0 ) return 0; /* Zero byte in a file -> binary */ if( c=='\n' ){ if( z[-1]=='\r' ){ result = -1; /* Contains CR/NL, continue */ } if( j>LENGTH_MASK ){ @@ -215,10 +215,78 @@ j = 0; } } if( j>LENGTH_MASK ){ return 0; /* Very long line -> binary */ + } + return result; /* No problems seen -> not binary */ +} + +/* +** Maximum length of a line in a text file, in UTF-16 characters. (4096) +** The number of bytes represented by this value cannot exceed LENGTH_MASK +** bytes, because that is the line buffer size by the diff engine. +*/ +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) + +/* +** The carriage-return / line-feed characters in the UTF-16be and UTF-16le +** encodings. +*/ +#define UTF16BE_CR ((wchar_t)'\r') +#define UTF16BE_LF ((wchar_t)'\n') +#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) +#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) + +/* +** This function attempts to scan each logical line within the blob to +** determine the type of content it appears to contain. Possible return +** values are: +** +** (1) -- The content appears to consist entirely of text, with lines +** delimited by line-feed characters; however, the encoding may +** not be UTF-16. +** +** (0) -- The content appears to be binary because it contains embedded +** NUL characters or an extremely long line. Since this function +** does not understand UTF-8, it may falsely consider UTF-8 text +** to be binary. +** +** (-1) -- The content appears to consist entirely of text, with lines +** delimited by carriage-return, line-feed pairs; however, the +** encoding may not be UTF-16. +** +*/ +int looks_like_utf16(const Blob *pContent){ + const wchar_t *z = (wchar_t *)blob_buffer(pContent); + unsigned int n = blob_size(pContent); + int j, c; + int result = 1; /* Assume UTF-16 text with no CR/NL */ + + /* Check individual lines. + */ + if( n==0 ) return result; /* Empty file -> text */ + if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ + c = *z; + if( c==0 ) return 0; /* NUL character in a file -> binary */ + j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); + while( (n-=2)>0 ){ + c = *++z; ++j; + if( c==0 ) return 0; /* NUL character in a file -> binary */ + if( c==UTF16BE_LF || c==UTF16LE_LF ){ + if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){ + result = -1; /* Contains CR/NL, continue */ + } + if( j>UTF16_LENGTH_MASK ){ + return 0; /* Very long line -> binary */ + } + j = 0; + } + } + if( j>UTF16_LENGTH_MASK ){ + return 0; /* Very long line -> binary */ } return result; /* No problems seen -> not binary */ } /*