Changes On Branch improve_commit_warning
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch improve_commit_warning Excluding Merge-Ins

This is equivalent to a diff from a98467b661 to fdf9050c4b

2013-02-18
13:46
Fixed ticket [5df2715635b99bd46a] (check-in count mismatch). check-in: b27c0d6d3f user: stephan tags: trunk
10:03
New function fossil_utf8_to_filename, such that fossil_unicode_to_utf8/fossil_utf8_to_unicode/fossil_unicode_free are not used on UNIX/MAC any more: On UNIX those 3 functions were only no-ops, but this allows them to be re-implemented for real unicode <-> utf-8 conversions. There is an "#ifdef _WIN32" around those 3 functions and 2 more (fossil_mbcs_to... Leaf check-in: cc3976fd30 user: jan.nijtmans tags: fossil_utf8_to_filename
08:30
merge trunk Leaf check-in: fdd51b617c user: jan.nijtmans tags: ticket-d17d6e5b17
2013-02-17
21:37
merge trunk Leaf check-in: fdf9050c4b user: jan.nijtmans tags: improve_commit_warning
14:47
More simplification in UTF-16 bom detection Leaf check-in: 1e70f211f9 user: jan.nijtmans tags: utf16Bom
14:43
Remove two unused variables check-in: a98467b661 user: jan.nijtmans tags: trunk
2013-02-16
14:12
Limit the complexity of the diff display on check-in information pages. check-in: 4f95ea8c56 user: drh tags: trunk
2013-02-07
09:39
merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning

Changes to src/blob.c.

  1025   1025     z = p->aData;
  1026   1026     for(i=j=0; z[i]; i++){
  1027   1027       if( z[i]!='\r' ) z[j++] = z[i];
  1028   1028     }
  1029   1029     z[j] = 0;
  1030   1030     p->nUsed = j;
  1031   1031   }
         1032  +
         1033  +/*
         1034  +** Convert blob from cp1252 to utf-8. As cp1252 is a superset
         1035  +** of iso8859-1, this is useful on UNIX as well.
         1036  +**
         1037  +** This table contains the character translations for 0x80..0x9F.
         1038  +*/
         1039  +
         1040  +static const unsigned short cp1252[32] = {
         1041  +  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
         1042  +  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
         1043  +    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
         1044  +   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
         1045  +};
         1046  +
         1047  +void blob_cp1252_to_utf8(Blob *p){
         1048  +  unsigned char *z = (unsigned char *)p->aData;
         1049  +  int j   = p->nUsed;
         1050  +  int i, n;
         1051  +  for(i=n=0; i<j; i++){
         1052  +    if( z[i]>=0x80 ){
         1053  +      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
         1054  +        n++;
         1055  +      }
         1056  +      n++;
         1057  +    }
         1058  +  }
         1059  +  j += n;
         1060  +  if( j>=p->nAlloc ){
         1061  +    blob_resize(p, j);
         1062  +    z = (unsigned char *)p->aData;
         1063  +  }
         1064  +  p->nUsed = j;
         1065  +  z[j] = 0;
         1066  +  while( j>i ){
         1067  +    if( z[--i]>=0x80 ){
         1068  +      if( z[i]<0xa0 ){
         1069  +        unsigned short sym = cp1252[z[i]&0x1f];
         1070  +        if( sym>=0x800 ){
         1071  +          z[--j] = 0x80 | (sym&0x3f);
         1072  +          z[--j] = 0x80 | ((sym>>6)&0x3f);
         1073  +          z[--j] = 0xe0 | (sym>>12);
         1074  +        }else{
         1075  +          z[--j] = 0x80 | (sym&0x3f);
         1076  +          z[--j] = 0xc0 | (sym>>6);
         1077  +        }
         1078  +      }else{
         1079  +        z[--j] = 0x80 | (z[i]&0x3F);
         1080  +        z[--j] = 0xC0 | (z[i]>>6);
         1081  +      }
         1082  +    }else{
         1083  +      z[--j] = z[i];
         1084  +    }
         1085  +  }
         1086  +}
  1032   1087   
  1033   1088   /*
  1034   1089   ** Shell-escape the given string.  Append the result to a blob.
  1035   1090   */
  1036   1091   void shell_escape(Blob *pBlob, const char *zIn){
  1037   1092     int n = blob_size(pBlob);
  1038   1093     int k = strlen(zIn);

Changes to src/checkin.c.

   906    906     char *zMsg;             /* Warning message */
   907    907     Blob fname;             /* Relative pathname of the file */
   908    908     static int allOk = 0;   /* Set to true to disable this routine */
   909    909   
   910    910     if( allOk ) return 0;
   911    911     fUnicode = starts_with_utf16_bom(p, 0, 0);
   912    912     eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
          913  +  if( eType<-2){
          914  +    const char *zWarning;
          915  +    const char *zDisable;
          916  +    const char *zConvert;
          917  +    Blob ans;
          918  +    char cReply;
          919  +
          920  +    if(eType==-4){
          921  +      if (binOk) goto go_on;
          922  +      zWarning = "long lines";
          923  +      zDisable = "\"binary-glob\" setting";
          924  +      zConvert = "";
          925  +    }else{
          926  +      if (encodingOk) goto go_on;
          927  +      zWarning = "invalid UTF-8";
          928  +      zDisable = "\"encoding-glob\" setting";
          929  +      zConvert = "c=convert/";
          930  +    }
          931  +    blob_zero(&ans);
          932  +    file_relative_name(zFilename, &fname, 0);
          933  +    zMsg = mprintf(
          934  +         "%s appears to be text, but contains %s. Use --no-warnings or the"
          935  +    	 " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ",
          936  +         blob_str(&fname), zWarning, zDisable, zConvert);
          937  +    prompt_user(zMsg, &ans);
          938  +    fossil_free(zMsg);
          939  +    cReply = blob_str(&ans)[0];
          940  +    if( *zConvert && (cReply=='c' || cReply=='C') ){
          941  +      char *zOrig = file_newname(zFilename, "original", 1);
          942  +      FILE *f;
          943  +      blob_write_to_file(p, zOrig);
          944  +      fossil_free(zOrig);
          945  +      f = fossil_fopen(zFilename, "wb");
          946  +      blob_cp1252_to_utf8(p);
          947  +      fwrite(blob_buffer(p), 1, blob_size(p), f);
          948  +      fclose(f);
          949  +      return 1;
          950  +    } else if( cReply!='y' && cReply!='Y' ){
          951  +      fossil_fatal("Abandoning commit due to %s in %s",
          952  +                   zWarning, blob_str(&fname));
          953  +    }
          954  +    blob_reset(&ans);
          955  +  go_on:
          956  +    eType +=4 ;
          957  +  }
   913    958     if( eType==0 || eType==-1 || fUnicode ){
   914    959       const char *zWarning;
   915    960       const char *zDisable;
   916    961       const char *zConvert = "c=convert/";
   917    962       Blob ans;
   918    963       char cReply;
   919    964   

Changes to src/diff.c.

    55     55   
    56     56   #define DIFF_TOO_MANY_CHANGES_TXT \
    57     57       "more than 10,000 changes\n"
    58     58   
    59     59   #define DIFF_TOO_MANY_CHANGES_HTML \
    60     60       "<p class='generalError'>More than 10,000 changes</p>\n"
    61     61   
    62         -#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
           62  +#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0)
    63     63   #endif /* INTERFACE */
    64     64   
    65     65   /*
    66     66   ** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
    67     67   */
    68     68   #define LENGTH_MASK_SZ  13
    69     69   #define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
................................................................................
   179    179     }
   180    180   
   181    181     /* Return results */
   182    182     *pnLine = nLine;
   183    183     return a;
   184    184   }
   185    185   
          186  +/*
          187  +** Macro which checks for proper UTF-8, when the first byte >= 0x80
          188  +** It uses the method described in:
          189  +**   http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
          190  +** except for the "overlong form" which is not considered
          191  +** invalid: Some languages like Java and Tcl use it.
          192  +**
          193  +** Any invalid byte causes bit 2 of result to be set (result |= 4),
          194  +** otherwise for valid multibyte utf-8 sequences n, j and z are
          195  +** updated so the continuation bytes are not checked again.
          196  + */
          197  +#define CHECKUTF8(c) \
          198  +if( c<0xC0 || c>=0xF8 ){ \
          199  +  result |= 4;  /* Invalid 1-byte or multibyte UTF-8, continue */ \
          200  +}else do{ \
          201  +  /* Check if all continuation bytes >=0x80 and <0xC0 */ \
          202  +  if( n<2 || ((z[1]&0xC0)!=0x80) ){ \
          203  +    result |= 4; /* Invalid continuation byte, continue */ \
          204  +    break; \
          205  +  }else{ \
          206  +    /* prepare for checking remaining continuation bytes */ \
          207  +    c<<=1; --n; ++j; ++z; \
          208  +  } \
          209  +}while( c>=0xC0 );
          210  +
   186    211   /*
   187    212   ** This function attempts to scan each logical line within the blob to
   188    213   ** determine the type of content it appears to contain.  Possible return
   189    214   ** values are:
   190    215   **
   191    216   **  (1) -- The content appears to consist entirely of text, with lines
   192         -**         delimited by line-feed characters; however, the encoding may
   193         -**         not be UTF-8.
          217  +**         delimited by line-feed characters.
   194    218   **
   195    219   **  (0) -- The content appears to be binary because it contains embedded
   196    220   **         NUL characters or an extremely long line.  Since this function
   197    221   **         does not understand UTF-16, it may falsely consider UTF-16 text
   198    222   **         to be binary.
   199    223   **
   200    224   ** (-1) -- The content appears to consist entirely of text, with lines
   201         -**         delimited by carriage-return, line-feed pairs; however, the
   202         -**         encoding may not be UTF-8.
          225  +**         delimited by carriage-return, line-feed pairs.
          226  +**
          227  +** (-3, -5) -- The same as (1, -1) respectively, but the content is not valid UTF-8.
          228  +**
          229  +** (-4) -- The same as 0, but the determination is based on the fact that
          230  +**         the blob might be text (any encoding) but it has a line length
          231  +**         bigger than the diff logic in fossil can handle.
   203    232   **
   204    233   ************************************ WARNING **********************************
   205    234   **
   206         -** This function does not validate that the blob content is properly formed
   207         -** UTF-8.  It assumes that all code points are the same size.  It does not
   208         -** validate any code points.  It makes no attempt to detect if any [invalid]
   209         -** switches between UTF-8 and other encodings occur.
          235  +** This function does not validate any code points.
   210    236   **
   211    237   ** The only code points that this function cares about are the NUL character,
   212    238   ** carriage-return, and line-feed.
   213    239   **
   214    240   ************************************ WARNING **********************************
   215    241   */
   216    242   int looks_like_utf8(const Blob *pContent){
   217         -  const char *z = blob_buffer(pContent);
          243  +  const unsigned char *z = (unsigned char *) blob_buffer(pContent);
   218    244     unsigned int n = blob_size(pContent);
   219         -  int j, c;
   220         -  int result = 1;  /* Assume UTF-8 text with no CR/NL */
          245  +  unsigned int j;
          246  +  unsigned char c;
          247  +  int result = 0;  /* Assume UTF-8 text with no CR/NL */
   221    248   
   222    249     /* Check individual lines.
   223    250     */
   224         -  if( n==0 ) return result;  /* Empty file -> text */
          251  +  if( n==0 ) return 1;  /* Empty file -> text */
   225    252     c = *z;
   226         -  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
   227    253     j = (c!='\n');
          254  +  if( c&0x80 ){
          255  +    CHECKUTF8(c)
          256  +  } else if( c==0 ){
          257  +    return 0;  /* Zero byte in a file -> binary */
          258  +  }
   228    259     while( --n>0 ){
   229    260       c = *++z; ++j;
   230         -    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
   231         -    if( c=='\n' ){
   232         -      int c2 = z[-1];
   233         -      if( c2=='\r' ){
   234         -        result = -1;  /* Contains CR/NL, continue */
          261  +    if( c&0x80 ){
          262  +      CHECKUTF8(c)
          263  +    } else if( c==0 ){
          264  +      return 0;  /* Zero byte in a file -> binary */
          265  +    } else if( c=='\n' ){
          266  +      if( z[-1]=='\r' ){
          267  +        result |= 2;  /* Contains CR/NL, continue */
   235    268         }
   236    269         if( j>LENGTH_MASK ){
   237         -        return 0;  /* Very long line -> binary */
          270  +        return -4;  /* Very long line -> binary */
   238    271         }
   239    272         j = 0;
   240    273       }
   241    274     }
   242    275     if( j>LENGTH_MASK ){
   243         -    return 0;  /* Very long line -> binary */
          276  +    return -4;  /* Very long line -> binary */
   244    277     }
   245         -  return result;  /* No problems seen -> not binary */
          278  +  return 1-result;  /* No problems seen -> not binary */
   246    279   }
   247    280   
   248    281   /*
   249    282   ** Define the type needed to represent a Unicode (UTF-16) character.
   250    283   */
   251    284   #ifndef WCHAR_T
   252    285   #  ifdef _WIN32
................................................................................
   286    319   **         NUL characters or an extremely long line.  Since this function
   287    320   **         does not understand UTF-8, it may falsely consider UTF-8 text
   288    321   **         to be binary.
   289    322   **
   290    323   ** (-1) -- The content appears to consist entirely of text, with lines
   291    324   **         delimited by carriage-return, line-feed pairs; however, the
   292    325   **         encoding may not be UTF-16.
          326  +**
          327  +** (-4) -- The same as 0, but the determination is based on the fact that
          328  +**         the blob might be text (any encoding) but it has a line length
          329  +**         bigger than the diff logic in fossil can handle.
   293    330   **
   294    331   ************************************ WARNING **********************************
   295    332   **
   296    333   ** This function does not validate that the blob content is properly formed
   297    334   ** UTF-16.  It assumes that all code points are the same size.  It does not
   298    335   ** validate any code points.  It makes no attempt to detect if any [invalid]
   299    336   ** switches between the UTF-16be and UTF-16le encodings occur.
................................................................................
   321    358       if( c==0 ) return 0;  /* NUL character in a file -> binary */
   322    359       if( c==UTF16BE_LF || c==UTF16LE_LF ){
   323    360         int c2 = z[-1];
   324    361         if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
   325    362           result = -1;  /* Contains CR/NL, continue */
   326    363         }
   327    364         if( j>UTF16_LENGTH_MASK ){
   328         -        return 0;  /* Very long line -> binary */
          365  +        return -4;  /* Very long line -> binary */
   329    366         }
   330    367         j = 0;
   331    368       }
   332    369     }
   333    370     if( j>UTF16_LENGTH_MASK ){
   334         -    return 0;  /* Very long line -> binary */
          371  +    return -4;  /* Very long line -> binary */
   335    372     }
   336    373     return result;  /* No problems seen -> not binary */
   337    374   }
   338    375   
   339    376   /*
   340    377   ** This function returns an array of bytes representing the byte-order-mark
   341    378   ** for UTF-8.