Fossil: Changes On Branch disallow-invalid-utf8-in-filenames

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch disallow-invalid-utf8-in-filenames Excluding Merge-Ins

This is equivalent to a diff from 1d462a683f to 7dabede3b3

2013-01-23
13:15		Further fine-tuning of the check for valid UTF8 characters in filenames. check-in: 4d456c9fd1 user: drh tags: trunk
10:38		put settings in right alphabetical order check-in: 4ddd099b57 user: jan.nijtmans tags: trunk
2013-01-21
13:12		Oops, make it work correctly now. Closed-Leaf check-in: 7dabede3b3 user: jan.nijtmans tags: disallow-invalid-utf8-in-filenames
10:29		merge trunk check-in: a68dffbff3 user: jan.nijtmans tags: improve_commit_warning
09:39		From the changes.wiki for Fossil 1.25: "Disallow invalid UTF8 characters (such as characters in the surrogate pair range) in filenames." This completes the set of UTF8 characters which are generally considered invalid, so they should be disallowed in filenames: the "overlong form", invalid continuation bytes, and -finally- noncharacters. check-in: 011d5f692d user: jan.nijtmans tags: disallow-invalid-utf8-in-filenames
2013-01-20
10:57		Fix the SQL for the command-line timeline so that it works for timeline items that are not associated with a particular branch. check-in: 1d462a683f user: drh tags: trunk
2013-01-18
21:34		Run "analyze" after a rebuild. For small repositories, the time doesn't matter, and for large repositories the effect on the query plans is huge. Push/pull, for example, will otherwise do a sequential scan of the blob table and join that with the unclustered table afterwards, when the other way around is several orders of magnitude more efficient... check-in: 80bf94e0f7 user: joerg tags: trunk

Changes to src/file.c.

   488    488   **     *  Does not contain any of these characters in the path: "\"
   489    489   **     *  Does not end with "/".
   490    490   **     *  Does not contain two or more "/" characters in a row.
   491    491   **     *  Contains at least one character
   492    492   **
   493    493   ** Invalid UTF8 characters result in a false return if bStrictUtf8 is
   494    494   ** true.  If bStrictUtf8 is false, invalid UTF8 characters are silently
   495         -** ignored.
          495  +** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
          496  +** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters)
   496    497   */
   497    498   int file_is_simple_pathname(const char *z, int bStrictUtf8){
   498    499     int i;
   499         -  char c = z[0];
          500  +  unsigned char c = (unsigned char) z[0];
   500    501     char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
   501    502     if( c=='/' || c==0 ) return 0;
   502    503     if( c=='.' ){
   503    504       if( z[1]=='/' || z[1]==0 ) return 0;
   504    505       if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0;
   505    506     }
   506         -  for(i=0; (c=z[i])!=0; i++){
          507  +  for(i=0; (c=(unsigned char)z[i])!=0; i++){
   507    508       if( c & maskNonAscii ){
   508         -      if( (c & 0xf0) == 0xf0 ) {
   509         -        /* Unicode characters > U+FFFF are not supported.
   510         -         * Windows XP and earlier cannot handle them.
   511         -         */
          509  +      if( c<0xc2 ){
          510  +        /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */
   512    511           return 0;
   513         -      }
   514         -      if( (c & 0xf0) == 0xe0 ) {
          512  +      }else if( (c&0xe0)==0xe0 ){
          513  +        /* 3-byte or more */
          514  +        int unicode;
          515  +        if( c&0x10 ){
          516  +          /* Unicode characters > U+FFFF are not supported.
          517  +           * Windows XP and earlier cannot handle them.
          518  +           */
          519  +          return 0;
          520  +        }
   515    521           /* This is a 3-byte UTF-8 character */
   516         -        if ( (c & 0xfe) == 0xee ){
   517         -          /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */
   518         -          if ( !(c & 1) || ((z[i+1] & 0xff) < 0xa4) ){
   519         -            /* Unicode character in the range U+E000 - U+F8FF are for
   520         -             * private use, they shouldn't occur in filenames.  */
          522  +        unicode = ((c&0x0f)<<12) + ((z[i+1]&0x3f)<<6) + (z[i+2]&0x3f);
          523  +        if( unicode <= 0x07ff ){
          524  +          /* overlong form */
          525  +          return 0;
          526  +        }else if( unicode>=0xe000 ){
          527  +          /* U+E000..U+FFFF */
          528  +          if( (unicode<=0xf8ff) || (unicode>=0xfffe) ){
          529  +            /* U+E000..U+F8FF are for private use.
          530  +             * U+FFFE..U+FFFF are noncharacters. */
          531  +            return 0;
          532  +          } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){
          533  +            /* U+FDD0..U+FDEF are noncharacters. */
   521    534               return 0;
   522    535             }
   523         -        }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){
   524         -          /* Unicode character in the range U+D800 - U+DFFF are for
   525         -           * surrogate pairs, they shouldn't occur in filenames. */
          536  +        }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){
          537  +          /* U+D800..U+DFFF are for surrogate pairs. */
   526    538             return 0;
   527    539           }
   528    540         }
          541  +      do{
          542  +        if( (z[i+1]&0xc0)!=0x80 ){
          543  +          /* Invalid continuation byte (multi-byte UTF-8) */
          544  +          return 0;
          545  +        }
          546  +        /* The hi-bits of c are used to keep track of the number of expected
          547  +         * continuation-bytes, so we don't need a separate counter. */
          548  +        c<<=1; ++i;
          549  +      }while( c>=0xc0 );
   529    550       }else if( c=='\\' ){
   530    551         return 0;
   531    552       }
   532    553       if( c=='/' ){
   533    554         if( z[i+1]=='/' ) return 0;
   534    555         if( z[i+1]=='.' ){
   535    556           if( z[i+2]=='/' || z[i+2]==0 ) return 0;
................................................................................
   576    597   #if defined(_WIN32)
   577    598     for(i=0; i<n; i++){
   578    599       if( z[i]=='\\' ) z[i] = '/';
   579    600     }
   580    601   #endif
   581    602   
   582    603     /* Removing trailing "/" characters */
   583         -  if ( !slash ){
          604  +  if( !slash ){
   584    605       while( n>1 && z[n-1]=='/' ){ n--; }
   585    606     }
   586    607   
   587    608     /* Remove duplicate '/' characters.  Except, two // at the beginning
   588    609     ** of a pathname is allowed since this is important on windows. */
   589    610     for(i=j=1; i<n; i++){
   590    611       z[j++] = z[i];
................................................................................
   833    854       if( zPath[i]==0 ){
   834    855         blob_reset(pOut);
   835    856         if( zPwd[i]==0 ){
   836    857           blob_append(pOut, ".", 1);
   837    858         }else{
   838    859           blob_append(pOut, "..", 2);
   839    860           for(j=i+1; zPwd[j]; j++){
   840         -          if( zPwd[j]=='/' ) {
          861  +          if( zPwd[j]=='/' ){
   841    862               blob_append(pOut, "/..", 3);
   842    863             }
   843    864           }
   844    865         }
   845    866         return;
   846    867       }
   847    868       if( zPwd[i]==0 && zPath[i]=='/' ){
................................................................................
   850    871         blob_append(pOut, &zPath[i+1], -1);
   851    872         blob_reset(&tmp);
   852    873         return;
   853    874       }
   854    875       while( zPath[i-1]!='/' ){ i--; }
   855    876       blob_set(&tmp, "../");
   856    877       for(j=i; zPwd[j]; j++){
   857         -      if( zPwd[j]=='/' ) {
          878  +      if( zPwd[j]=='/' ){
   858    879           blob_append(&tmp, "../", 3);
   859    880         }
   860    881       }
   861    882       blob_append(&tmp, &zPath[i], -1);
   862    883       blob_reset(pOut);
   863    884       memcpy(pOut, &tmp, sizeof(tmp));
   864    885     }