Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch commitWarningV2 Excluding Merge-Ins
This is equivalent to a diff from ef6c243ed9 to 7d3a06b89a
2012-11-02
| ||
02:27 | Merge commit warning and file content type detection changes to trunk. check-in: 0c7c61447f user: mistachkin tags: trunk | |
2012-11-01
| ||
20:19 | Add detection of binary data with no leading UTF-16 byte-order-mark. Closed-Leaf check-in: 7d3a06b89a user: mistachkin tags: commitWarningV2 | |
20:09 | Improve detection of UTF-8, UTF-16, binary data, and carriage returns during commit operations. check-in: c837e44445 user: mistachkin tags: commitWarningV2 | |
12:32 | merge trunk check-in: 9e97de3410 user: jan.nijtmans tags: use-blob_strip_bom | |
11:48 | merge trunk; let looks_like_text() give different values for UTF-16 BE/LE. Not used yet. check-in: 348637dedf user: jan.nijtmans tags: improve_looks_like_binary | |
10:20 | Restore Style fix, which got lost by [618258421767778c] check-in: ef6c243ed9 user: jan.nijtmans tags: trunk | |
07:40 | don't check for same BOM twice check-in: 8c32e6f0dd user: jan.nijtmans tags: trunk | |
Changes to src/checkin.c.
884 884 885 885 /* 886 886 ** Issue a warning and give the user an opportunity to abandon out 887 887 ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending 888 888 ** is seen in a text file. 889 889 */ 890 890 static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){ 891 - int eType; /* return value of looks_like_text() */ 891 + int eType; /* return value of looks_like_utf8/utf16() */ 892 892 int fUnicode; /* return value of starts_with_utf16_bom() */ 893 893 char *zMsg; /* Warning message */ 894 894 Blob fname; /* Relative pathname of the file */ 895 895 static int allOk = 0; /* Set to true to disable this routine */ 896 896 897 897 if( allOk ) return; 898 - eType = looks_like_text(p); 899 898 fUnicode = starts_with_utf16_bom(p); 900 - if( eType==-1 || fUnicode ){ 899 + eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); 900 + if( eType==0 || eType==-1 || fUnicode ){ 901 901 const char *zWarning; 902 902 Blob ans; 903 903 char cReply; 904 904 905 905 if( eType==-1 && fUnicode ){ 906 906 zWarning = "Unicode and CR/NL line endings"; 907 907 }else if( eType==-1 ){ 908 908 if( crnlOk ){ 909 909 return; /* We don't want CR/NL warnings for this file. */ 910 910 } 911 911 zWarning = "CR/NL line endings"; 912 + }else if( eType==0 ){ 913 + zWarning = "binary data"; 912 914 }else{ 913 915 zWarning = "Unicode"; 914 916 } 915 917 file_relative_name(zFilename, &fname, 0); 916 918 blob_zero(&ans); 917 919 zMsg = mprintf( 918 920 "%s contains %s. commit anyhow (a=all/y/N)? ",
Changes to src/diff.c.
46 46 */ 47 47 #define DIFF_CANNOT_COMPUTE_BINARY \ 48 48 "cannot compute difference between binary files\n" 49 49 50 50 #define DIFF_CANNOT_COMPUTE_SYMLINK \ 51 51 "cannot compute difference between symlink and regular file\n" 52 52 53 -#define looks_like_binary(blob) (looks_like_text((blob)) == 0) 53 +#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) 54 54 #endif /* INTERFACE */ 55 55 56 56 /* 57 -** Maximum length of a line in a text file. (8192) 57 +** Maximum length of a line in a text file, in bytes. (8192) 58 58 */ 59 59 #define LENGTH_MASK_SZ 13 60 60 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) 61 61 62 62 /* 63 63 ** Information about each line of a file being diffed. 64 64 ** ................................................................................ 177 177 ** values are: 178 178 ** 179 179 ** (1) -- The content appears to consist entirely of text, with lines 180 180 ** delimited by line-feed characters; however, the encoding may 181 181 ** not be UTF-8. 182 182 ** 183 183 ** (0) -- The content appears to be binary because it contains embedded 184 -** NUL (\000) characters or an extremely long line. Since this 185 -** function does not understand UTF-16, it may falsely consider 186 -** UTF-16 text to be binary. 184 +** NUL characters or an extremely long line. Since this function 185 +** does not understand UTF-16, it may falsely consider UTF-16 text 186 +** to be binary. 187 187 ** 188 188 ** (-1) -- The content appears to consist entirely of text, with lines 189 189 ** delimited by carriage-return, line-feed pairs; however, the 190 190 ** encoding may not be UTF-8. 
191 191 ** 192 192 */ 193 -int looks_like_text(const Blob *pContent){ 193 +int looks_like_utf8(const Blob *pContent){ 194 194 const char *z = blob_buffer(pContent); 195 195 unsigned int n = blob_size(pContent); 196 196 int j, c; 197 - int result = 1; /* Assume text with no CR/NL */ 197 + int result = 1; /* Assume UTF-8 text with no CR/NL */ 198 198 199 199 /* Check individual lines. 200 200 */ 201 201 if( n==0 ) return result; /* Empty file -> text */ 202 202 c = *z; 203 - if( c==0 ) return 0; /* \000 byte in a file -> binary */ 203 + if( c==0 ) return 0; /* Zero byte in a file -> binary */ 204 204 j = (c!='\n'); 205 205 while( --n>0 ){ 206 206 c = *++z; ++j; 207 - if( c==0 ) return 0; /* \000 byte in a file -> binary */ 207 + if( c==0 ) return 0; /* Zero byte in a file -> binary */ 208 208 if( c=='\n' ){ 209 209 if( z[-1]=='\r' ){ 210 210 result = -1; /* Contains CR/NL, continue */ 211 211 } 212 212 if( j>LENGTH_MASK ){ 213 213 return 0; /* Very long line -> binary */ 214 214 } 215 215 j = 0; 216 216 } 217 217 } 218 218 if( j>LENGTH_MASK ){ 219 219 return 0; /* Very long line -> binary */ 220 + } 221 + return result; /* No problems seen -> not binary */ 222 +} 223 + 224 +/* 225 +** Maximum length of a line in a text file, in UTF-16 characters. (4096) 226 +** The number of bytes represented by this value cannot exceed LENGTH_MASK 227 +** bytes, because that is the line buffer size used by the diff engine. 228 +*/ 229 +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) 230 +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) 231 + 232 +/* 233 +** The carriage-return / line-feed characters in the UTF-16be and UTF-16le 234 +** encodings. 
235 +*/ 236 +#define UTF16BE_CR ((wchar_t)'\r') 237 +#define UTF16BE_LF ((wchar_t)'\n') 238 +#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) 239 +#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) 240 + 241 +/* 242 +** This function attempts to scan each logical line within the blob to 243 +** determine the type of content it appears to contain. Possible return 244 +** values are: 245 +** 246 +** (1) -- The content appears to consist entirely of text, with lines 247 +** delimited by line-feed characters; however, the encoding may 248 +** not be UTF-16. 249 +** 250 +** (0) -- The content appears to be binary because it contains embedded 251 +** NUL characters or an extremely long line. Since this function 252 +** does not understand UTF-8, it may falsely consider UTF-8 text 253 +** to be binary. 254 +** 255 +** (-1) -- The content appears to consist entirely of text, with lines 256 +** delimited by carriage-return, line-feed pairs; however, the 257 +** encoding may not be UTF-16. 258 +** 259 +*/ 260 +int looks_like_utf16(const Blob *pContent){ 261 + const wchar_t *z = (wchar_t *)blob_buffer(pContent); 262 + unsigned int n = blob_size(pContent); 263 + int j, c; 264 + int result = 1; /* Assume UTF-16 text with no CR/NL */ 265 + 266 + /* Check individual lines. 
267 + */ 268 + if( n==0 ) return result; /* Empty file -> text */ 269 + if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ 270 + c = *z; 271 + if( c==0 ) return 0; /* NUL character in a file -> binary */ 272 + j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); 273 + while( (n-=2)>0 ){ 274 + c = *++z; ++j; 275 + if( c==0 ) return 0; /* NUL character in a file -> binary */ 276 + if( c==UTF16BE_LF || c==UTF16LE_LF ){ 277 + if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){ 278 + result = -1; /* Contains CR/NL, continue */ 279 + } 280 + if( j>UTF16_LENGTH_MASK ){ 281 + return 0; /* Very long line -> binary */ 282 + } 283 + j = 0; 284 + } 285 + } 286 + if( j>UTF16_LENGTH_MASK ){ 287 + return 0; /* Very long line -> binary */ 220 288 } 221 289 return result; /* No problems seen -> not binary */ 222 290 } 223 291 224 292 /* 225 293 ** This function returns non-zero if the blob starts with a UTF-16le or 226 294 ** UTF-16be byte-order-mark (BOM).