Changes On Branch improve_commit_warning
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch improve_commit_warning Excluding Merge-Ins

This is equivalent to a diff from a98467b661 to fdf9050c4b

2013-02-18
13:46
Fixed ticket [5df2715635b99bd46a] (check-in count mismatch). check-in: b27c0d6d3f user: stephan tags: trunk
10:03
New function fossil_utf8_to_filename, such that fossil_unicode_to_utf8/fossil_utf8_to_unicode/fossil_unicode_free are not used on UNIX/MAC any more: On UNIX those 3 functions were only no-ops, but this allows to re-implement then for real unicode <-> utf-8 conversions. There is an "#ifdef _WIN32" around those 3 functions and 2 more (fossil_mbcs_to... Leaf check-in: cc3976fd30 user: jan.nijtmans tags: fossil_utf8_to_filename
08:30
merge trunk Leaf check-in: fdd51b617c user: jan.nijtmans tags: ticket-d17d6e5b17
2013-02-17
21:37
merge trunk Leaf check-in: fdf9050c4b user: jan.nijtmans tags: improve_commit_warning
14:47
More simplification in UTF-16 bom detection Leaf check-in: 1e70f211f9 user: jan.nijtmans tags: utf16Bom
14:43
Remove two unused variables check-in: a98467b661 user: jan.nijtmans tags: trunk
2013-02-16
14:12
Limit the complexity of the diff display on check-in information pages. check-in: 4f95ea8c56 user: drh tags: trunk
2013-02-07
09:39
merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning

Changes to src/blob.c.

1025
1026
1027
1028
1029
1030
1031























































1032
1033
1034
1035
1036
1037
1038
  z = p->aData;
  for(i=j=0; z[i]; i++){
    if( z[i]!='\r' ) z[j++] = z[i];
  }
  z[j] = 0;
  p->nUsed = j;
}
























































/*
** Shell-escape the given string.  Append the result to a blob.
*/
void shell_escape(Blob *pBlob, const char *zIn){
  int n = blob_size(pBlob);
  int k = strlen(zIn);






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
  z = p->aData;
  for(i=j=0; z[i]; i++){
    if( z[i]!='\r' ) z[j++] = z[i];
  }
  z[j] = 0;
  p->nUsed = j;
}

/*
** Convert blob from cp1252 to utf-8. As cp1252 is a superset
** of iso8895-1, this is useful on UNIX as well.
**
** This table contains the character translations for 0x80..0xA0.
*/

static const unsigned short cp1252[32] = {
  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
};

void blob_cp1252_to_utf8(Blob *p){
  unsigned char *z = (unsigned char *)p->aData;
  int j   = p->nUsed;
  int i, n;
  for(i=n=0; i<j; i++){
    if( z[i]>=0x80 ){
      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
        n++;
      }
      n++;
    }
  }
  j += n;
  if( j>=p->nAlloc ){
    blob_resize(p, j);
    z = (unsigned char *)p->aData;
  }
  p->nUsed = j;
  z[j] = 0;
  while( j>i ){
    if( z[--i]>=0x80 ){
      if( z[i]<0xa0 ){
        unsigned short sym = cp1252[z[i]&0x1f];
        if( sym>=0x800 ){
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0x80 | ((sym>>6)&0x3f);
          z[--j] = 0xe0 | (sym>>12);
        }else{
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0xc0 | (sym>>6);
        }
      }else{
        z[--j] = 0x80 | (z[i]&0x3F);
        z[--j] = 0xC0 | (z[i]>>6);
      }
    }else{
      z[--j] = z[i];
    }
  }
}

/*
** Shell-escape the given string.  Append the result to a blob.
*/
void shell_escape(Blob *pBlob, const char *zIn){
  int n = blob_size(pBlob);
  int k = strlen(zIn);

Changes to src/checkin.c.

906
907
908
909
910
911
912













































913
914
915
916
917
918
919
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = starts_with_utf16_bom(p, 0, 0);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);













































  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = starts_with_utf16_bom(p, 0, 0);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType<-2){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert;
    Blob ans;
    char cReply;

    if(eType==-4){
      if (binOk) goto go_on;
      zWarning = "long lines";
      zDisable = "\"binary-glob\" setting";
      zConvert = "";
    }else{
      if (encodingOk) goto go_on;
      zWarning = "invalid UTF-8";
      zDisable = "\"encoding-glob\" setting";
      zConvert = "c=convert/";
    }
    blob_zero(&ans);
    file_relative_name(zFilename, &fname, 0);
    zMsg = mprintf(
         "%s appears to be text, but contains %s. Use --no-warnings or the"
    	 " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ",
         blob_str(&fname), zWarning, zDisable, zConvert);
    prompt_user(zMsg, &ans);
    fossil_free(zMsg);
    cReply = blob_str(&ans)[0];
    if( *zConvert && (cReply=='c' || cReply=='C') ){
      char *zOrig = file_newname(zFilename, "original", 1);
      FILE *f;
      blob_write_to_file(p, zOrig);
      fossil_free(zOrig);
      f = fossil_fopen(zFilename, "wb");
      blob_cp1252_to_utf8(p);
      fwrite(blob_buffer(p), 1, blob_size(p), f);
      fclose(f);
      return 1;
    } else if( cReply!='y' && cReply!='Y' ){
      fossil_fatal("Abandoning commit due to %s in %s",
                   zWarning, blob_str(&fname));
    }
    blob_reset(&ans);
  go_on:
    eType +=4 ;
  }
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;

Changes to src/diff.c.

55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
...
179
180
181
182
183
184
185

























186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202





203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

220
221
222
223
224
225
226
227





228
229



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
...
286
287
288
289
290
291
292




293
294
295
296
297
298
299
...
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#define DIFF_TOO_MANY_CHANGES_TXT \
    "more than 10,000 changes\n"

#define DIFF_TOO_MANY_CHANGES_HTML \
    "<p class='generalError'>More than 10,000 changes</p>\n"

#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
  }

  /* Return results */
  *pnLine = nLine;
  return a;
}


























/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.





**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-8.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between UTF-8 and other encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;

  int result = 1;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
  j = (c!='\n');





  while( --n>0 ){
    c = *++z; ++j;



    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
    if( c=='\n' ){
      int c2 = z[-1];
      if( c2=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
................................................................................
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.




**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
................................................................................
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.






|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






|
<







|
|
>
>
>
>
>



|
<
<
<







|

|
>
|



|

<

>
>
>
>
>


>
>
>
|
|
<
|
|


|





|

|







 







>
>
>
>







 







|





|







55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252

253
254
255
256
257
258
259
260
261
262
263
264
265

266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
...
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
...
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#define DIFF_TOO_MANY_CHANGES_TXT \
    "more than 10,000 changes\n"

#define DIFF_TOO_MANY_CHANGES_HTML \
    "<p class='generalError'>More than 10,000 changes</p>\n"

#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
  }

  /* Return results */
  *pnLine = nLine;
  return a;
}

/*
** Macro which checks for proper UTF-8, when the first byte >= 0x80
** It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" which is not considered
** invalid: Some languages like Java and Tcl use it.
**
** Any invalid byte causes bit 2 of result to be set (result |= 4),
** otherwise for valid multibyte utf-8 sequences n, j and z are
** updated so the continuation bytes are not checked again.
 */
#define CHECKUTF8(c) \
if( c<0xC0 || c>=0xF8 ){ \
  result |= 4;  /* Invalid 1-byte or multibyte UTF-8, continue */ \
}else do{ \
  /* Check if all continuation bytes >=0x80 and <0xC0 */ \
  if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
    result |= 4; /* Invalid continuation byte, continue */ \
    break; \
  }else{ \
    /* prepare for checking remaining continuation bytes */ \
    c<<=1; --n; ++j; ++z; \
  } \
}while( c>=0xC0 );

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters.

**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs.
**
** (-3, -5) The same as (1, -1); however, the encoding is not UTF-8 or ASCII.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate any code points.



**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned int j;
  unsigned char c;
  int result = 0;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return 1;  /* Empty file -> text */
  c = *z;

  j = (c!='\n');
  if( c&0x80 ){
    CHECKUTF8(c)
  } else if( c==0 ){
    return 0;  /* Zero byte in a file -> binary */
  }
  while( --n>0 ){
    c = *++z; ++j;
    if( c&0x80 ){
      CHECKUTF8(c)
    } else if( c==0 ){
      return 0;  /* Zero byte in a file -> binary */
    } else if( c=='\n' ){

      if( z[-1]=='\r' ){
        result |= 2;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return -4;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return -4;  /* Very long line -> binary */
  }
  return 1-result;  /* No problems seen -> not binary */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
................................................................................
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
................................................................................
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return -4;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return -4;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.