Fossil: Changes On Branch improve_commit

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch improve_commit_warning Excluding Merge-Ins

This is equivalent to a diff from a98467b661 to fdf9050c4b

2013-02-18
13:46		Fixed ticket [5df2715635b99bd46a] (check-in count mismatch). check-in: b27c0d6d3f user: stephan tags: trunk
10:03		New function fossil_utf8_to_filename, such that fossil_unicode_to_utf8/fossil_utf8_to_unicode/fossil_unicode_free are not used on UNIX/MAC any more: On UNIX those 3 functions were only no-ops, but this allows to re-implement then for real unicode <-> utf-8 conversions. There is an "#ifdef _WIN32" around those 3 functions and 2 more (fossil_mbcs_to... Leaf check-in: cc3976fd30 user: jan.nijtmans tags: fossil_utf8_to_filename
08:30		merge trunk Leaf check-in: fdd51b617c user: jan.nijtmans tags: ticket-d17d6e5b17
2013-02-17
21:37		merge trunk Leaf check-in: fdf9050c4b user: jan.nijtmans tags: improve_commit_warning
14:47		More simplification in UTF-16 bom detection Leaf check-in: 1e70f211f9 user: jan.nijtmans tags: utf16Bom
14:43		Remove two unused variables check-in: a98467b661 user: jan.nijtmans tags: trunk
2013-02-16
14:12		Limit the complexity of the diff display on check-in information pages. check-in: 4f95ea8c56 user: drh tags: trunk
2013-02-07
09:39		merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning

Changes to src/blob.c.

  z = p->aData;
  for(i=j=0; z[i]; i++){
    if( z[i]!='\r' ) z[j++] = z[i];
  }
  z[j] = 0;
  p->nUsed = j;
}
























































/*
** Shell-escape the given string.  Append the result to a blob.
*/
void shell_escape(Blob *pBlob, const char *zIn){
  int n = blob_size(pBlob);
  int k = strlen(zIn);







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

  z = p->aData;
  for(i=j=0; z[i]; i++){
    if( z[i]!='\r' ) z[j++] = z[i];
  }
  z[j] = 0;
  p->nUsed = j;
}

/*
** Convert blob from cp1252 to utf-8. As cp1252 is a superset
** of iso8895-1, this is useful on UNIX as well.
**
** This table contains the character translations for 0x80..0xA0.
*/

static const unsigned short cp1252[32] = {
  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
};

void blob_cp1252_to_utf8(Blob *p){
  unsigned char *z = (unsigned char *)p->aData;
  int j   = p->nUsed;
  int i, n;
  for(i=n=0; i<j; i++){
    if( z[i]>=0x80 ){
      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
        n++;
      }
      n++;
    }
  }
  j += n;
  if( j>=p->nAlloc ){
    blob_resize(p, j);
    z = (unsigned char *)p->aData;
  }
  p->nUsed = j;
  z[j] = 0;
  while( j>i ){
    if( z[--i]>=0x80 ){
      if( z[i]<0xa0 ){
        unsigned short sym = cp1252[z[i]&0x1f];
        if( sym>=0x800 ){
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0x80 | ((sym>>6)&0x3f);
          z[--j] = 0xe0 | (sym>>12);
        }else{
          z[--j] = 0x80 | (sym&0x3f);
          z[--j] = 0xc0 | (sym>>6);
        }
      }else{
        z[--j] = 0x80 | (z[i]&0x3F);
        z[--j] = 0xC0 | (z[i]>>6);
      }
    }else{
      z[--j] = z[i];
    }
  }
}

/*
** Shell-escape the given string.  Append the result to a blob.
*/
void shell_escape(Blob *pBlob, const char *zIn){
  int n = blob_size(pBlob);
  int k = strlen(zIn);

Changes to src/checkin.c.

  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = starts_with_utf16_bom(p, 0, 0);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);













































  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = starts_with_utf16_bom(p, 0, 0);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType<-2){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert;
    Blob ans;
    char cReply;

    if(eType==-4){
      if (binOk) goto go_on;
      zWarning = "long lines";
      zDisable = "\"binary-glob\" setting";
      zConvert = "";
    }else{
      if (encodingOk) goto go_on;
      zWarning = "invalid UTF-8";
      zDisable = "\"encoding-glob\" setting";
      zConvert = "c=convert/";
    }
    blob_zero(&ans);
    file_relative_name(zFilename, &fname, 0);
    zMsg = mprintf(
         "%s appears to be text, but contains %s. Use --no-warnings or the"
    	 " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ",
         blob_str(&fname), zWarning, zDisable, zConvert);
    prompt_user(zMsg, &ans);
    fossil_free(zMsg);
    cReply = blob_str(&ans)[0];
    if( *zConvert && (cReply=='c' || cReply=='C') ){
      char *zOrig = file_newname(zFilename, "original", 1);
      FILE *f;
      blob_write_to_file(p, zOrig);
      fossil_free(zOrig);
      f = fossil_fopen(zFilename, "wb");
      blob_cp1252_to_utf8(p);
      fwrite(blob_buffer(p), 1, blob_size(p), f);
      fclose(f);
      return 1;
    } else if( cReply!='y' && cReply!='Y' ){
      fossil_fatal("Abandoning commit due to %s in %s",
                   zWarning, blob_str(&fname));
    }
    blob_reset(&ans);
  go_on:
    eType +=4 ;
  }
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;

Changes to src/diff.c.

#define DIFF_TOO_MANY_CHANGES_TXT \
    "more than 10,000 changes\n"

#define DIFF_TOO_MANY_CHANGES_HTML \
    "<p class='generalError'>More than 10,000 changes</p>\n"

#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
  }

  /* Return results */
  *pnLine = nLine;
  return a;
}


























/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-8.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.





**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-8.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between UTF-8 and other encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;

  int result = 1;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
  j = (c!='\n');





  while( --n>0 ){
    c = *++z; ++j;



    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
    if( c=='\n' ){
      int c2 = z[-1];
      if( c2=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
................................................................................
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.




**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
................................................................................
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






|
<







|
|
>
>
>
>
>



|
<
<
<







|

|
>
|



|

<

>
>
>
>
>


>
>
>
|
|
<
|
|


|





|

|







 







>
>
>
>







 







|





|

#define DIFF_TOO_MANY_CHANGES_TXT \
    "more than 10,000 changes\n"

#define DIFF_TOO_MANY_CHANGES_HTML \
    "<p class='generalError'>More than 10,000 changes</p>\n"

#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
  }

  /* Return results */
  *pnLine = nLine;
  return a;
}

/*
** Macro which checks for proper UTF-8, when the first byte >= 0x80
** It uses the method described in:
**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
** except for the "overlong form" which is not considered
** invalid: Some languages like Java and Tcl use it.
**
** Any invalid byte causes bit 2 of result to be set (result |= 4),
** otherwise for valid multibyte utf-8 sequences n, j and z are
** updated so the continuation bytes are not checked again.
 */
#define CHECKUTF8(c) \
if( c<0xC0 || c>=0xF8 ){ \
  result |= 4;  /* Invalid 1-byte or multibyte UTF-8, continue */ \
}else do{ \
  /* Check if all continuation bytes >=0x80 and <0xC0 */ \
  if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
    result |= 4; /* Invalid continuation byte, continue */ \
    break; \
  }else{ \
    /* prepare for checking remaining continuation bytes */ \
    c<<=1; --n; ++j; ++z; \
  } \
}while( c>=0xC0 );

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters.

**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs.
**
** (-3, -5) The same as (1, -1); however, the encoding is not UTF-8 or ASCII.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate any code points.



**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  unsigned int j;
  unsigned char c;
  int result = 0;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return 1;  /* Empty file -> text */
  c = *z;

  j = (c!='\n');
  if( c&0x80 ){
    CHECKUTF8(c)
  } else if( c==0 ){
    return 0;  /* Zero byte in a file -> binary */
  }
  while( --n>0 ){
    c = *++z; ++j;
    if( c&0x80 ){
      CHECKUTF8(c)
    } else if( c==0 ){
      return 0;  /* Zero byte in a file -> binary */
    } else if( c=='\n' ){

      if( z[-1]=='\r' ){
        result |= 2;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        return -4;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>LENGTH_MASK ){
    return -4;  /* Very long line -> binary */
  }
  return 1-result;  /* No problems seen -> not binary */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
................................................................................
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
................................................................................
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return -4;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( j>UTF16_LENGTH_MASK ){
    return -4;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.