Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch improve_commit_warning Excluding Merge-Ins
This is equivalent to a diff from a98467b661 to fdf9050c4b
2013-02-18
| ||
13:46 | Fixed ticket [5df2715635b99bd46a] (check-in count mismatch). check-in: b27c0d6d3f user: stephan tags: trunk | |
10:03 | New function fossil_utf8_to_filename, such that fossil_unicode_to_utf8/fossil_utf8_to_unicode/fossil_unicode_free are not used on UNIX/MAC any more: On UNIX those 3 functions were only no-ops, but this allows them to be re-implemented for real unicode <-> utf-8 conversions. There is an "#ifdef _WIN32" around those 3 functions and 2 more (fossil_mbcs_to... Leaf check-in: cc3976fd30 user: jan.nijtmans tags: fossil_utf8_to_filename | |
08:30 | merge trunk Leaf check-in: fdd51b617c user: jan.nijtmans tags: ticket-d17d6e5b17 | |
2013-02-17
| ||
21:37 | merge trunk Leaf check-in: fdf9050c4b user: jan.nijtmans tags: improve_commit_warning | |
14:47 | More simplification in UTF-16 bom detection Leaf check-in: 1e70f211f9 user: jan.nijtmans tags: utf16Bom | |
14:43 | Remove two unused variables check-in: a98467b661 user: jan.nijtmans tags: trunk | |
2013-02-16
| ||
14:12 | Limit the complexity of the diff display on check-in information pages. check-in: 4f95ea8c56 user: drh tags: trunk | |
2013-02-07
| ||
09:39 | merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning | |
Changes to src/blob.c.
1025 1025 z = p->aData; 1026 1026 for(i=j=0; z[i]; i++){ 1027 1027 if( z[i]!='\r' ) z[j++] = z[i]; 1028 1028 } 1029 1029 z[j] = 0; 1030 1030 p->nUsed = j; 1031 1031 } 1032 + 1033 +/* 1034 +** Convert blob from cp1252 to utf-8. As cp1252 is a superset 1035 +** of iso8859-1, this is useful on UNIX as well. 1036 +** 1037 +** This table contains the character translations for 0x80..0x9F. 1038 +*/ 1039 + 1040 +static const unsigned short cp1252[32] = { 1041 + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 1042 + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, 1043 + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 1044 + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 1045 +}; 1046 + 1047 +void blob_cp1252_to_utf8(Blob *p){ 1048 + unsigned char *z = (unsigned char *)p->aData; 1049 + int j = p->nUsed; 1050 + int i, n; 1051 + for(i=n=0; i<j; i++){ 1052 + if( z[i]>=0x80 ){ 1053 + if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ 1054 + n++; 1055 + } 1056 + n++; 1057 + } 1058 + } 1059 + j += n; 1060 + if( j>=p->nAlloc ){ 1061 + blob_resize(p, j); 1062 + z = (unsigned char *)p->aData; 1063 + } 1064 + p->nUsed = j; 1065 + z[j] = 0; 1066 + while( j>i ){ 1067 + if( z[--i]>=0x80 ){ 1068 + if( z[i]<0xa0 ){ 1069 + unsigned short sym = cp1252[z[i]&0x1f]; 1070 + if( sym>=0x800 ){ 1071 + z[--j] = 0x80 | (sym&0x3f); 1072 + z[--j] = 0x80 | ((sym>>6)&0x3f); 1073 + z[--j] = 0xe0 | (sym>>12); 1074 + }else{ 1075 + z[--j] = 0x80 | (sym&0x3f); 1076 + z[--j] = 0xc0 | (sym>>6); 1077 + } 1078 + }else{ 1079 + z[--j] = 0x80 | (z[i]&0x3F); 1080 + z[--j] = 0xC0 | (z[i]>>6); 1081 + } 1082 + }else{ 1083 + z[--j] = z[i]; 1084 + } 1085 + } 1086 +} 1032 1087 1033 1088 /* 1034 1089 ** Shell-escape the given string. Append the result to a blob. 1035 1090 */ 1036 1091 void shell_escape(Blob *pBlob, const char *zIn){ 1037 1092 int n = blob_size(pBlob); 1038 1093 int k = strlen(zIn);
Changes to src/checkin.c.
906 906 char *zMsg; /* Warning message */ 907 907 Blob fname; /* Relative pathname of the file */ 908 908 static int allOk = 0; /* Set to true to disable this routine */ 909 909 910 910 if( allOk ) return 0; 911 911 fUnicode = starts_with_utf16_bom(p, 0, 0); 912 912 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); 913 + if( eType<-2){ 914 + const char *zWarning; 915 + const char *zDisable; 916 + const char *zConvert; 917 + Blob ans; 918 + char cReply; 919 + 920 + if(eType==-4){ 921 + if (binOk) goto go_on; 922 + zWarning = "long lines"; 923 + zDisable = "\"binary-glob\" setting"; 924 + zConvert = ""; 925 + }else{ 926 + if (encodingOk) goto go_on; 927 + zWarning = "invalid UTF-8"; 928 + zDisable = "\"encoding-glob\" setting"; 929 + zConvert = "c=convert/"; 930 + } 931 + blob_zero(&ans); 932 + file_relative_name(zFilename, &fname, 0); 933 + zMsg = mprintf( 934 + "%s appears to be text, but contains %s. Use --no-warnings or the" 935 + " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ", 936 + blob_str(&fname), zWarning, zDisable, zConvert); 937 + prompt_user(zMsg, &ans); 938 + fossil_free(zMsg); 939 + cReply = blob_str(&ans)[0]; 940 + if( *zConvert && (cReply=='c' || cReply=='C') ){ 941 + char *zOrig = file_newname(zFilename, "original", 1); 942 + FILE *f; 943 + blob_write_to_file(p, zOrig); 944 + fossil_free(zOrig); 945 + f = fossil_fopen(zFilename, "wb"); 946 + blob_cp1252_to_utf8(p); 947 + fwrite(blob_buffer(p), 1, blob_size(p), f); 948 + fclose(f); 949 + return 1; 950 + } else if( cReply!='y' && cReply!='Y' ){ 951 + fossil_fatal("Abandoning commit due to %s in %s", 952 + zWarning, blob_str(&fname)); 953 + } 954 + blob_reset(&ans); 955 + go_on: 956 + eType +=4 ; 957 + } 913 958 if( eType==0 || eType==-1 || fUnicode ){ 914 959 const char *zWarning; 915 960 const char *zDisable; 916 961 const char *zConvert = "c=convert/"; 917 962 Blob ans; 918 963 char cReply; 919 964
Changes to src/diff.c.
55 55 56 56 #define DIFF_TOO_MANY_CHANGES_TXT \ 57 57 "more than 10,000 changes\n" 58 58 59 59 #define DIFF_TOO_MANY_CHANGES_HTML \ 60 60 "<p class='generalError'>More than 10,000 changes</p>\n" 61 61 62 -#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) 62 +#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0) 63 63 #endif /* INTERFACE */ 64 64 65 65 /* 66 66 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) 67 67 */ 68 68 #define LENGTH_MASK_SZ 13 69 69 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) ................................................................................ 179 179 } 180 180 181 181 /* Return results */ 182 182 *pnLine = nLine; 183 183 return a; 184 184 } 185 185 186 +/* 187 +** Macro which checks for proper UTF-8, when the first byte >= 0x80 188 +** It uses the method described in: 189 +** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences 190 +** except for the "overlong form" which is not considered 191 +** invalid: Some languages like Java and Tcl use it. 192 +** 193 +** Any invalid byte causes bit 2 of result to be set (result |= 4), 194 +** otherwise for valid multibyte utf-8 sequences n, j and z are 195 +** updated so the continuation bytes are not checked again. 196 + */ 197 +#define CHECKUTF8(c) \ 198 +if( c<0xC0 || c>=0xF8 ){ \ 199 + result |= 4; /* Invalid 1-byte or multibyte UTF-8, continue */ \ 200 +}else do{ \ 201 + /* Check if all continuation bytes >=0x80 and <0xC0 */ \ 202 + if( n<2 || ((z[1]&0xC0)!=0x80) ){ \ 203 + result |= 4; /* Invalid continuation byte, continue */ \ 204 + break; \ 205 + }else{ \ 206 + /* prepare for checking remaining continuation bytes */ \ 207 + c<<=1; --n; ++j; ++z; \ 208 + } \ 209 +}while( c>=0xC0 ); 210 + 186 211 /* 187 212 ** This function attempts to scan each logical line within the blob to 188 213 ** determine the type of content it appears to contain. 
Possible return 189 214 ** values are: 190 215 ** 191 216 ** (1) -- The content appears to consist entirely of text, with lines 192 -** delimited by line-feed characters; however, the encoding may 193 -** not be UTF-8. 217 +** delimited by line-feed characters. 194 218 ** 195 219 ** (0) -- The content appears to be binary because it contains embedded 196 220 ** NUL characters or an extremely long line. Since this function 197 221 ** does not understand UTF-16, it may falsely consider UTF-16 text 198 222 ** to be binary. 199 223 ** 200 224 ** (-1) -- The content appears to consist entirely of text, with lines 201 -** delimited by carriage-return, line-feed pairs; however, the 202 -** encoding may not be UTF-8. 225 +** delimited by carriage-return, line-feed pairs. 226 +** 227 +** (-3, -5) The same as (1, -1); however, the encoding is not UTF-8 or ASCII. 228 +** 229 +** (-4) -- The same as 0, but the determination is based on the fact that 230 +** the blob might be text (any encoding) but it has a line length 231 +** bigger than the diff logic in fossil can handle. 203 232 ** 204 233 ************************************ WARNING ********************************** 205 234 ** 206 -** This function does not validate that the blob content is properly formed 207 -** UTF-8. It assumes that all code points are the same size. It does not 208 -** validate any code points. It makes no attempt to detect if any [invalid] 209 -** switches between UTF-8 and other encodings occur. 235 +** This function does not validate any code points. 210 236 ** 211 237 ** The only code points that this function cares about are the NUL character, 212 238 ** carriage-return, and line-feed. 
213 239 ** 214 240 ************************************ WARNING ********************************** 215 241 */ 216 242 int looks_like_utf8(const Blob *pContent){ 217 - const char *z = blob_buffer(pContent); 243 + const unsigned char *z = (unsigned char *) blob_buffer(pContent); 218 244 unsigned int n = blob_size(pContent); 219 - int j, c; 220 - int result = 1; /* Assume UTF-8 text with no CR/NL */ 245 + unsigned int j; 246 + unsigned char c; 247 + int result = 0; /* Assume UTF-8 text with no CR/NL */ 221 248 222 249 /* Check individual lines. 223 250 */ 224 - if( n==0 ) return result; /* Empty file -> text */ 251 + if( n==0 ) return 1; /* Empty file -> text */ 225 252 c = *z; 226 - if( c==0 ) return 0; /* Zero byte in a file -> binary */ 227 253 j = (c!='\n'); 254 + if( c&0x80 ){ 255 + CHECKUTF8(c) 256 + } else if( c==0 ){ 257 + return 0; /* Zero byte in a file -> binary */ 258 + } 228 259 while( --n>0 ){ 229 260 c = *++z; ++j; 230 - if( c==0 ) return 0; /* Zero byte in a file -> binary */ 231 - if( c=='\n' ){ 232 - int c2 = z[-1]; 233 - if( c2=='\r' ){ 234 - result = -1; /* Contains CR/NL, continue */ 261 + if( c&0x80 ){ 262 + CHECKUTF8(c) 263 + } else if( c==0 ){ 264 + return 0; /* Zero byte in a file -> binary */ 265 + } else if( c=='\n' ){ 266 + if( z[-1]=='\r' ){ 267 + result |= 2; /* Contains CR/NL, continue */ 235 268 } 236 269 if( j>LENGTH_MASK ){ 237 - return 0; /* Very long line -> binary */ 270 + return -4; /* Very long line -> binary */ 238 271 } 239 272 j = 0; 240 273 } 241 274 } 242 275 if( j>LENGTH_MASK ){ 243 - return 0; /* Very long line -> binary */ 276 + return -4; /* Very long line -> binary */ 244 277 } 245 - return result; /* No problems seen -> not binary */ 278 + return 1-result; /* No problems seen -> not binary */ 246 279 } 247 280 248 281 /* 249 282 ** Define the type needed to represent a Unicode (UTF-16) character. 
250 283 */ 251 284 #ifndef WCHAR_T 252 285 # ifdef _WIN32 ................................................................................ 286 319 ** NUL characters or an extremely long line. Since this function 287 320 ** does not understand UTF-8, it may falsely consider UTF-8 text 288 321 ** to be binary. 289 322 ** 290 323 ** (-1) -- The content appears to consist entirely of text, with lines 291 324 ** delimited by carriage-return, line-feed pairs; however, the 292 325 ** encoding may not be UTF-16. 326 +** 327 +** (-4) -- The same as 0, but the determination is based on the fact that 328 +** the blob might be text (any encoding) but it has a line length 329 +** bigger than the diff logic in fossil can handle. 293 330 ** 294 331 ************************************ WARNING ********************************** 295 332 ** 296 333 ** This function does not validate that the blob content is properly formed 297 334 ** UTF-16. It assumes that all code points are the same size. It does not 298 335 ** validate any code points. It makes no attempt to detect if any [invalid] 299 336 ** switches between the UTF-16be and UTF-16le encodings occur. ................................................................................ 321 358 if( c==0 ) return 0; /* NUL character in a file -> binary */ 322 359 if( c==UTF16BE_LF || c==UTF16LE_LF ){ 323 360 int c2 = z[-1]; 324 361 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ 325 362 result = -1; /* Contains CR/NL, continue */ 326 363 } 327 364 if( j>UTF16_LENGTH_MASK ){ 328 - return 0; /* Very long line -> binary */ 365 + return -4; /* Very long line -> binary */ 329 366 } 330 367 j = 0; 331 368 } 332 369 } 333 370 if( j>UTF16_LENGTH_MASK ){ 334 - return 0; /* Very long line -> binary */ 371 + return -4; /* Very long line -> binary */ 335 372 } 336 373 return result; /* No problems seen -> not binary */ 337 374 } 338 375 339 376 /* 340 377 ** This function returns an array of bytes representing the byte-order-mark 341 378 ** for UTF-8.