Fossil: Changes On Branch commitWarningV2

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch commitWarningV2 Excluding Merge-Ins

This is equivalent to a diff from ef6c243ed9 to 7d3a06b89a

2012-11-02
02:27		Merge commit warning and file content type detection changes to trunk. check-in: 0c7c61447f user: mistachkin tags: trunk
2012-11-01
20:19		Add detection of binary data with no leading UTF-16 byte-order-mark. Closed-Leaf check-in: 7d3a06b89a user: mistachkin tags: commitWarningV2
20:09		Improve detection of UTF-8, UTF-16, binary data, and carriage returns during commit operations. check-in: c837e44445 user: mistachkin tags: commitWarningV2
12:32		merge trunk check-in: 9e97de3410 user: jan.nijtmans tags: use-blob_strip_bom
11:48		merge trunk let looks_like_text() give different values for UTF-16 BE/LE. Not used yet. check-in: 348637dedf user: jan.nijtmans tags: improve_looks_like_binary
10:20		Restore Style fix, which got lost by [618258421767778c] check-in: ef6c243ed9 user: jan.nijtmans tags: trunk
07:40		dont check for same BOM twice check-in: 8c32e6f0dd user: jan.nijtmans tags: trunk

Changes to src/checkin.c.

/*
** Issue a warning and give the user an opportunity to abandon out
** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
** is seen in a text file.
*/
static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
  int eType;              /* return value of looks_like_text() */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  eType = looks_like_text(p);
  fUnicode = starts_with_utf16_bom(p);
  if( eType==-1 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";


    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",







|






|
|
|











>
>

/*
** Issue a warning and give the user an opportunity to abandon out
** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
** is seen in a text file.
*/
static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
  int eType;              /* return value of looks_like_utf8/utf16() */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return;
  fUnicode = starts_with_utf16_bom(p);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    Blob ans;
    char cReply;

    if( eType==-1 && fUnicode ){
      zWarning = "Unicode and CR/NL line endings";
    }else if( eType==-1 ){
      if( crnlOk ){
        return; /* We don't want CR/NL warnings for this file. */
      }
      zWarning = "CR/NL line endings";
    }else if( eType==0 ){
      zWarning = "binary data";
    }else{
      zWarning = "Unicode";
    }
    file_relative_name(zFilename, &fname, 0);
    blob_zero(&ans);
    zMsg = mprintf(
         "%s contains %s.  commit anyhow (a=all/y/N)? ",

Changes to src/diff.c.

*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)

/*
** Information about each line of a file being diffed.
**
................................................................................
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL (\000) characters or an extremely long line.  Since this
**         function does not understand UTF-16, it may falsely consider
**         UTF-16 text to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
*/
int looks_like_text(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* \000 byte in a file -> binary */
  j = (c!='\n');
  while( --n>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* \000 byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */




































































  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns non-zero if the blob starts with a UTF-16le or
** UTF-16be byte-order-mark (BOM).







|



|







 







|
|
|






|



|





|



|












>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

*/
#define DIFF_CANNOT_COMPUTE_BINARY \
    "cannot compute difference between binary files\n"

#define DIFF_CANNOT_COMPUTE_SYMLINK \
    "cannot compute difference between symlink and regular file\n"

#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (8192)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)

/*
** Information about each line of a file being diffed.
**
................................................................................
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
  j = (c!='\n');
  while( --n>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
    if( c=='\n' ){
      if( z[-1]=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-1)
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-16.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
*/
int looks_like_utf16(const Blob *pContent){
  const wchar_t *z = (wchar_t *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-16 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns non-zero if the blob starts with a UTF-16le or
** UTF-16be byte-order-mark (BOM).