Changes On Branch commitWarningV2
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch commitWarningV2 Excluding Merge-Ins

This is equivalent to a diff from ef6c243ed9 to 7d3a06b89a

2012-11-02
02:27
Merge commit warning and file content type detection changes to trunk. check-in: 0c7c61447f user: mistachkin tags: trunk
2012-11-01
20:19
Add detection of binary data with no leading UTF-16 byte-order-mark. Closed-Leaf check-in: 7d3a06b89a user: mistachkin tags: commitWarningV2
20:09
Improve detection of UTF-8, UTF-16, binary data, and carriage returns during commit operations. check-in: c837e44445 user: mistachkin tags: commitWarningV2
12:32
merge trunk check-in: 9e97de3410 user: jan.nijtmans tags: use-blob_strip_bom
11:48
Merge trunk. Let looks_like_text() give different values for UTF-16 BE/LE. Not used yet. check-in: 348637dedf user: jan.nijtmans tags: improve_looks_like_binary
10:20
Restore style fix, which got lost in [618258421767778c]. check-in: ef6c243ed9 user: jan.nijtmans tags: trunk
07:40
Don't check for the same BOM twice. check-in: 8c32e6f0dd user: jan.nijtmans tags: trunk

Changes to src/checkin.c.

   884    884   
   885    885   /*
   886    886   ** Issue a warning and give the user an opportunity to abandon out
   887    887   ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending
   888    888   ** is seen in a text file.
   889    889   */
   890    890   static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){
   891         -  int eType;              /* return value of looks_like_text() */
          891  +  int eType;              /* return value of looks_like_utf8/utf16() */
   892    892     int fUnicode;           /* return value of starts_with_utf16_bom() */
   893    893     char *zMsg;             /* Warning message */
   894    894     Blob fname;             /* Relative pathname of the file */
   895    895     static int allOk = 0;   /* Set to true to disable this routine */
   896    896   
   897    897     if( allOk ) return;
   898         -  eType = looks_like_text(p);
   899    898     fUnicode = starts_with_utf16_bom(p);
   900         -  if( eType==-1 || fUnicode ){
          899  +  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
          900  +  if( eType==0 || eType==-1 || fUnicode ){
   901    901       const char *zWarning;
   902    902       Blob ans;
   903    903       char cReply;
   904    904   
   905    905       if( eType==-1 && fUnicode ){
   906    906         zWarning = "Unicode and CR/NL line endings";
   907    907       }else if( eType==-1 ){
   908    908         if( crnlOk ){
   909    909           return; /* We don't want CR/NL warnings for this file. */
   910    910         }
   911    911         zWarning = "CR/NL line endings";
          912  +    }else if( eType==0 ){
          913  +      zWarning = "binary data";
   912    914       }else{
   913    915         zWarning = "Unicode";
   914    916       }
   915    917       file_relative_name(zFilename, &fname, 0);
   916    918       blob_zero(&ans);
   917    919       zMsg = mprintf(
   918    920            "%s contains %s.  commit anyhow (a=all/y/N)? ",

Changes to src/diff.c.

    46     46   */
    47     47   #define DIFF_CANNOT_COMPUTE_BINARY \
    48     48       "cannot compute difference between binary files\n"
    49     49   
    50     50   #define DIFF_CANNOT_COMPUTE_SYMLINK \
    51     51       "cannot compute difference between symlink and regular file\n"
    52     52   
    53         -#define looks_like_binary(blob) (looks_like_text((blob)) == 0)
           53  +#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
    54     54   #endif /* INTERFACE */
    55     55   
    56     56   /*
    57         -** Maximum length of a line in a text file.  (8192)
           57  +** Maximum length of a line in a text file, in bytes.  (8192)
    58     58   */
    59     59   #define LENGTH_MASK_SZ  13
    60     60   #define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
    61     61   
    62     62   /*
    63     63   ** Information about each line of a file being diffed.
    64     64   **
................................................................................
   177    177   ** values are:
   178    178   **
   179    179   **  (1) -- The content appears to consist entirely of text, with lines
   180    180   **         delimited by line-feed characters; however, the encoding may
   181    181   **         not be UTF-8.
   182    182   **
   183    183   **  (0) -- The content appears to be binary because it contains embedded
   184         -**         NUL (\000) characters or an extremely long line.  Since this
   185         -**         function does not understand UTF-16, it may falsely consider
   186         -**         UTF-16 text to be binary.
          184  +**         NUL characters or an extremely long line.  Since this function
          185  +**         does not understand UTF-16, it may falsely consider UTF-16 text
          186  +**         to be binary.
   187    187   **
   188    188   ** (-1) -- The content appears to consist entirely of text, with lines
   189    189   **         delimited by carriage-return, line-feed pairs; however, the
   190    190   **         encoding may not be UTF-8.
   191    191   **
   192    192   */
   193         -int looks_like_text(const Blob *pContent){
          193  +int looks_like_utf8(const Blob *pContent){
   194    194     const char *z = blob_buffer(pContent);
   195    195     unsigned int n = blob_size(pContent);
   196    196     int j, c;
   197         -  int result = 1;  /* Assume text with no CR/NL */
          197  +  int result = 1;  /* Assume UTF-8 text with no CR/NL */
   198    198   
   199    199     /* Check individual lines.
   200    200     */
   201    201     if( n==0 ) return result;  /* Empty file -> text */
   202    202     c = *z;
   203         -  if( c==0 ) return 0;  /* \000 byte in a file -> binary */
          203  +  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
   204    204     j = (c!='\n');
   205    205     while( --n>0 ){
   206    206       c = *++z; ++j;
   207         -    if( c==0 ) return 0;  /* \000 byte in a file -> binary */
          207  +    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
   208    208       if( c=='\n' ){
   209    209         if( z[-1]=='\r' ){
   210    210           result = -1;  /* Contains CR/NL, continue */
   211    211         }
   212    212         if( j>LENGTH_MASK ){
   213    213           return 0;  /* Very long line -> binary */
   214    214         }
   215    215         j = 0;
   216    216       }
   217    217     }
   218    218     if( j>LENGTH_MASK ){
   219    219       return 0;  /* Very long line -> binary */
          220  +  }
          221  +  return result;  /* No problems seen -> not binary */
          222  +}
          223  +
          224  +/*
          225  +** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
          226  +** The number of bytes represented by this value cannot exceed LENGTH_MASK
          227  +** bytes, because that is the line buffer size used by the diff engine.
          228  +*/
          229  +#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-1)
          230  +#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)
          231  +
          232  +/*
          233  +** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
          234  +** encodings.
          235  +*/
          236  +#define UTF16BE_CR  ((wchar_t)'\r')
          237  +#define UTF16BE_LF  ((wchar_t)'\n')
          238  +#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
          239  +#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
          240  +
          241  +/*
          242  +** This function attempts to scan each logical line within the blob to
          243  +** determine the type of content it appears to contain.  Possible return
          244  +** values are:
          245  +**
          246  +**  (1) -- The content appears to consist entirely of text, with lines
          247  +**         delimited by line-feed characters; however, the encoding may
          248  +**         not be UTF-16.
          249  +**
          250  +**  (0) -- The content appears to be binary because it contains embedded
          251  +**         NUL characters or an extremely long line.  Since this function
          252  +**         does not understand UTF-8, it may falsely consider UTF-8 text
          253  +**         to be binary.
          254  +**
          255  +** (-1) -- The content appears to consist entirely of text, with lines
          256  +**         delimited by carriage-return, line-feed pairs; however, the
          257  +**         encoding may not be UTF-16.
          258  +**
          259  +*/
          260  +int looks_like_utf16(const Blob *pContent){
          261  +  const wchar_t *z = (wchar_t *)blob_buffer(pContent);
          262  +  unsigned int n = blob_size(pContent);
          263  +  int j, c;
          264  +  int result = 1;  /* Assume UTF-16 text with no CR/NL */
          265  +
          266  +  /* Check individual lines.
          267  +  */
          268  +  if( n==0 ) return result;  /* Empty file -> text */
          269  +  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
          270  +  c = *z;
          271  +  if( c==0 ) return 0;  /* NUL character in a file -> binary */
          272  +  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
          273  +  while( (n-=2)>0 ){
          274  +    c = *++z; ++j;
          275  +    if( c==0 ) return 0;  /* NUL character in a file -> binary */
          276  +    if( c==UTF16BE_LF || c==UTF16LE_LF ){
          277  +      if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){
          278  +        result = -1;  /* Contains CR/NL, continue */
          279  +      }
          280  +      if( j>UTF16_LENGTH_MASK ){
          281  +        return 0;  /* Very long line -> binary */
          282  +      }
          283  +      j = 0;
          284  +    }
          285  +  }
          286  +  if( j>UTF16_LENGTH_MASK ){
          287  +    return 0;  /* Very long line -> binary */
   220    288     }
   221    289     return result;  /* No problems seen -> not binary */
   222    290   }
   223    291   
   224    292   /*
   225    293   ** This function returns non-zero if the blob starts with a UTF-16le or
   226    294   ** UTF-16be byte-order-mark (BOM).