Changes On Branch simplify-starts-with
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch simplify-starts-with Excluding Merge-Ins

This is equivalent to a diff from 43c4522623 to c209105f0f

2013-02-07
15:28
Divide blob length check (even number of bytes) and UTF-32 check in the 3 versions of the UTF-16 BOM functions. check-in: be6756e26b user: jan.nijtmans tags: trunk
09:39
merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning
09:19
If a file starts with a UTF-32 BOM, always consider it binary, without warning. Closed-Leaf check-in: c209105f0f user: jan.nijtmans tags: simplify-starts-with
08:47
Combine the 4 "starts_with_utf??_bom" functions into a single, easier-to-use function, "starts_with_bom". In addition, it only checks for a UTF-16 BOM if the blob has an even number of bytes. check-in: 6c417d8bf5 user: jan.nijtmans tags: simplify-starts-with
02:08
Add the test-ssh-far-side command that can be used in place of a shell for the remote side of an ssh: sync. check-in: 43c4522623 user: drh tags: trunk
00:24
Add the shell= query parameter to the ssh: scheme for cloning and syncing. check-in: 2163cd9666 user: drh tags: trunk

Changes to src/blob.c.

  1093   1093   ** Strip a possible byte-order-mark (BOM) from the blob. On Windows, if there
  1094   1094   ** is either no BOM at all or an (le/be) UTF-16 BOM, a conversion to UTF-8 is
  1095   1095   ** done.  If useMbcs is false and there is no BOM, the input string is assumed
  1096   1096   ** to be UTF-8 already, so no conversion is done.
  1097   1097   */
  1098   1098   void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
  1099   1099     char *zUtf8;
  1100         -  int bomSize = 0;
  1101         -  if( starts_with_utf8_bom(pBlob, &bomSize) ){
         1100  +  int bomSize = starts_with_bom(pBlob);
         1101  +  if( bomSize == 3 ){
  1102   1102       struct Blob temp;
  1103   1103       zUtf8 = blob_str(pBlob) + bomSize;
  1104   1104       blob_zero(&temp);
  1105   1105       blob_append(&temp, zUtf8, -1);
  1106   1106       blob_swap(pBlob, &temp);
  1107   1107       blob_reset(&temp);
  1108   1108   #ifdef _WIN32
  1109         -  }else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
  1110         -    /* Make sure the blob contains two terminating 0-bytes */
  1111         -    blob_append(pBlob, "", 1);
  1112         -    zUtf8 = blob_str(pBlob) + bomSize;
  1113         -    zUtf8 = fossil_unicode_to_utf8(zUtf8);
  1114         -    blob_zero(pBlob);
  1115         -    blob_append(pBlob, zUtf8, -1);
  1116         -    fossil_unicode_free(zUtf8);
  1117         -  }else if( starts_with_utf16be_bom(pBlob, &bomSize) ){
  1118         -    unsigned int i = blob_size(pBlob);
         1109  +  }else if( bomSize == 2 ){
  1119   1110       zUtf8 = blob_buffer(pBlob);
  1120         -    while( i > 0 ){
  1121         -      /* swap bytes of unicode representation */
  1122         -      char zTemp = zUtf8[--i];
  1123         -      zUtf8[i] = zUtf8[i-1];
  1124         -      zUtf8[--i] = zTemp;
         1111  +    if (*((unsigned short *)zUtf8) == 0xfffe) {
         1112  +      /* Found BOM, but with reversed bytes */
         1113  +      unsigned int i = blob_size(pBlob);
         1114  +      while( i > 0 ){
         1115  +        /* swap bytes of unicode representation */
         1116  +        char zTemp = zUtf8[--i];
         1117  +        zUtf8[i] = zUtf8[i-1];
         1118  +        zUtf8[--i] = zTemp;
         1119  +      }
  1125   1120       }
  1126   1121       /* Make sure the blob contains two terminating 0-bytes */
  1127   1122       blob_append(pBlob, "", 1);
  1128   1123       zUtf8 = blob_str(pBlob) + bomSize;
  1129   1124       zUtf8 = fossil_unicode_to_utf8(zUtf8);
  1130   1125       blob_zero(pBlob);
  1131   1126       blob_append(pBlob, zUtf8, -1);

Changes to src/checkin.c.

   897    897     Blob *p,              /* The content of the file being committed. */
   898    898     int crnlOk,           /* Non-zero if CR/NL warnings should be disabled. */
   899    899     int binOk,            /* Non-zero if binary warnings should be disabled. */
   900    900     int encodingOk,        /* Non-zero if encoding warnings should be disabled. */
   901    901     const char *zFilename /* The full name of the file being committed. */
   902    902   ){
   903    903     int eType;              /* return value of looks_like_utf8/utf16() */
   904         -  int fUnicode;           /* return value of starts_with_utf16_bom() */
          904  +  int fUnicode;           /* 1 if  blob starts with UTF-16 BOM */
   905    905     char *zMsg;             /* Warning message */
   906    906     Blob fname;             /* Relative pathname of the file */
   907    907     static int allOk = 0;   /* Set to true to disable this routine */
   908    908   
   909    909     if( allOk ) return 0;
   910         -  fUnicode = starts_with_utf16_bom(p, 0);
          910  +  fUnicode = (starts_with_bom(p) == 2);
   911    911     eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
   912    912     if( eType==0 || eType==-1 || fUnicode ){
   913    913       const char *zWarning;
   914    914       const char *zDisable;
   915    915       const char *zConvert = "c=convert/";
   916    916       Blob ans;
   917    917       char cReply;

Changes to src/diff.c.

   338    338       0xEF, 0xBB, 0xBF, 0x00, 0x00, 0x00
   339    339     };
   340    340     if( pnByte ) *pnByte = 3;
   341    341     return bom;
   342    342   }
   343    343   
   344    344   /*
   345         -** This function returns non-zero if the blob starts with a UTF-8
   346         -** byte-order-mark (BOM).
          345  +** This function returns detected BOM size if the blob starts with
          346  +** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
   347    347   */
   348         -int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
          348  +int starts_with_bom(const Blob *pContent){
   349    349     const char *z = blob_buffer(pContent);
   350         -  int bomSize = 0;
          350  +  int c1, bomSize = 0;
   351    351     const unsigned char *bom = get_utf8_bom(&bomSize);
   352    352   
   353         -  if( pnByte ) *pnByte = bomSize;
   354         -  if( blob_size(pContent)<bomSize ) return 0;
   355         -  return memcmp(z, bom, bomSize)==0;
   356         -}
   357         -
   358         -/*
   359         -** This function returns non-zero if the blob starts with a UTF-16le or
   360         -** UTF-16be byte-order-mark (BOM).
   361         -*/
   362         -int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
   363         -  const char *z = blob_buffer(pContent);
   364         -  int c1, c2;
   365         -
   366         -  if( pnByte ) *pnByte = 2;
   367         -  if( blob_size(pContent)<2 ) return 0;
   368         -  c1 = z[0]; c2 = z[1];
   369         -  if( (c1==(char)0xff) && (c2==(char)0xfe) ){
   370         -    return 1;
   371         -  }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
   372         -    return 1;
   373         -  }
   374         -  return 0;
   375         -}
   376         -
   377         -/*
   378         -** This function returns non-zero if the blob starts with a UTF-16le
   379         -** byte-order-mark (BOM).
   380         -*/
   381         -int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
   382         -  const char *z = blob_buffer(pContent);
   383         -  int c1, c2;
   384         -
   385         -  if( pnByte ) *pnByte = 2;
   386         -  if( blob_size(pContent)<2 ) return 0;
   387         -  c1 = z[0]; c2 = z[1];
   388         -  if( (c1==(char)0xff) && (c2==(char)0xfe) ){
   389         -    return 1;
   390         -  }
   391         -  return 0;
   392         -}
   393         -
   394         -/*
   395         -** This function returns non-zero if the blob starts with a UTF-16be
   396         -** byte-order-mark (BOM).
   397         -*/
   398         -int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
   399         -  const char *z = blob_buffer(pContent);
   400         -  int c1, c2;
   401         -
   402         -  if( pnByte ) *pnByte = 2;
   403         -  if( blob_size(pContent)<2 ) return 0;
   404         -  c1 = z[0]; c2 = z[1];
   405         -  if( (c1==(char)0xfe) && (c2==(char)0xff) ){
   406         -    return 1;
          353  +  if( (blob_size(pContent)>=bomSize)
          354  +      && (memcmp(z, bom, bomSize)==0) ){
          355  +    return bomSize;
          356  +  }
          357  +  /* Only accept UTF-16 BOM if the blob has an even number of bytes */
          358  +  if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;
          359  +  c1 = *((unsigned short *)z);
          360  +  if( (c1==0xfffe) || (c1==0xfeff) ){
          361  +    if( blob_size(pContent)>=4 ){
          362  +      /* For UTF-32 BOM, always return 0. */
          363  +      if( ((unsigned short *)z)[1] == 0 ) return 0;
          364  +    }
          365  +    return 2;
   407    366     }
   408    367     return 0;
   409    368   }
   410    369   
   411    370   /*
   412    371   ** Return true if two DLine elements are identical.
   413    372   */
................................................................................
  2367   2326   
  2368   2327     zLimit = find_option("limit",0,1);
  2369   2328     if( zLimit==0 || zLimit[0]==0 ) zLimit = "-1";
  2370   2329     iLimit = atoi(zLimit);
  2371   2330     showLog = find_option("log",0,0)!=0;
  2372   2331     fileVers = find_option("filevers",0,0)!=0;
  2373   2332     db_must_be_within_tree();
  2374         -  if (g.argc<3) {
         2333  +  if( g.argc<3 ){
  2375   2334       usage("FILENAME");
  2376   2335     }
  2377   2336     file_tree_name(g.argv[2], &treename, 1);
  2378   2337     zFilename = blob_str(&treename);
  2379   2338     fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", zFilename);
  2380   2339     if( fnid==0 ){
  2381   2340       fossil_fatal("no such file: %s", zFilename);
  2382   2341     }
  2383   2342     fid = db_int(0, "SELECT rid FROM vfile WHERE pathname=%Q", zFilename);
  2384   2343     if( fid==0 ){
  2385   2344       fossil_fatal("not part of current checkout: %s", zFilename);
  2386   2345     }
  2387   2346     cid = db_lget_int("checkout", 0);
  2388         -  if (cid == 0){
         2347  +  if( cid == 0 ){
  2389   2348       fossil_fatal("Not in a checkout");
  2390   2349     }
  2391   2350     if( iLimit<=0 ) iLimit = 1000000000;
  2392   2351     compute_direct_ancestors(cid, iLimit);
  2393   2352     mid = db_int(0, "SELECT mlink.mid FROM mlink, ancestor "
  2394   2353             " WHERE mlink.fid=%d AND mlink.fnid=%d AND mlink.mid=ancestor.rid"
  2395   2354             " ORDER BY ancestor.generation ASC LIMIT 1",