Index: src/blob.c ================================================================== --- src/blob.c +++ src/blob.c @@ -1091,36 +1091,42 @@ /* ** Strip a possible BOM from the blob. On Windows, if there ** is either no BOM at all or an (le/be) UTF-16 BOM, a conversion ** to UTF-8 is done. -** If useMbcs is false and there is no BOM, the input string +** If useMbcs is 0 and there is no BOM, the input string ** is assumed to be UTF-8 already, so no conversion is done. +** If useMbcs is 2, any BOM is replaced by the UTF-8 BOM */ void blob_strip_bom(Blob *pBlob, int useMbcs){ static const unsigned char bom[] = { 0xEF, 0xBB, 0xBF }; #ifdef _WIN32 static const unsigned short ubom = 0xfeff; static const unsigned short urbom = 0xfffe; #endif /* _WIN32 */ char *zUtf8; if( blob_size(pBlob)>2 && memcmp(blob_buffer(pBlob), bom, 3)==0 ) { - struct Blob temp; - zUtf8 = blob_str(pBlob) + 3; - blob_zero(&temp); - blob_append(&temp, zUtf8, -1); - fossil_mbcs_free(zUtf8); - blob_swap(pBlob, &temp); - blob_reset(&temp); + if( useMbcs<2 ){ + struct Blob temp; + zUtf8 = blob_str(pBlob) + 3; + blob_zero(&temp); + blob_append(&temp, zUtf8, -1); + fossil_mbcs_free(zUtf8); + blob_swap(pBlob, &temp); + blob_reset(&temp); + } #ifdef _WIN32 }else if( blob_size(pBlob)>1 && (blob_size(pBlob)&1)==0 && memcmp(blob_buffer(pBlob), &ubom, 2)==0 ) { /* Make sure the blob contains two terminating 0-bytes */ blob_append(pBlob, "", 1); zUtf8 = blob_str(pBlob) + 2; zUtf8 = fossil_unicode_to_utf8(zUtf8); blob_zero(pBlob); + if( useMbcs>1 ){ + blob_append(pBlob, (char*)bom, 3); + } blob_append(pBlob, zUtf8, -1); fossil_mbcs_free(zUtf8); }else if( blob_size(pBlob)>1 && (blob_size(pBlob)&1)==0 && memcmp(blob_buffer(pBlob), &urbom, 2)==0 ) { unsigned int i = blob_size(pBlob); @@ -1134,15 +1140,18 @@ /* Make sure the blob contains two terminating 0-bytes */ blob_append(pBlob, "", 1); zUtf8 = blob_str(pBlob) + 2; zUtf8 = fossil_unicode_to_utf8(zUtf8); blob_zero(pBlob); + if( useMbcs>1 ){ + blob_append(pBlob, 
(char*)bom, 3); + } blob_append(pBlob, zUtf8, -1); fossil_mbcs_free(zUtf8); - }else if (useMbcs) { + }else if (useMbcs==1) { zUtf8 = fossil_mbcs_to_utf8(blob_str(pBlob)); blob_zero(pBlob); blob_append(pBlob, zUtf8, -1); fossil_mbcs_free(zUtf8); #endif /* _WIN32 */ } } Index: src/diff.c ================================================================== --- src/diff.c +++ src/diff.c @@ -45,14 +45,17 @@ ** here for consistency. */ #define DIFF_CANNOT_COMPUTE_BINARY \ "cannot compute difference between binary files\n" +#define DIFF_CANNOT_COMPUTE_ENCODING \ + "cannot compute difference between files with different encodings\n" + #define DIFF_CANNOT_COMPUTE_SYMLINK \ "cannot compute difference between symlink and regular file\n" -#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) +#define looks_like_text(blob) (looks_like_utf8(blob)&3) #endif /* INTERFACE */ /* ** Maximum length of a line in a text file, in bytes. (8192) */ @@ -183,11 +186,13 @@ ** (0) -- The content appears to be binary because it contains embedded ** NUL characters or an extremely long line. Since this function ** does not understand UTF-16, it may falsely consider UTF-16 text ** to be binary. ** -** (-1) -- The content appears to consist entirely of text, with lines +** (-1,-2) UTF-16 (le/be) +** +** (-3) -- The content appears to consist entirely of text, with lines ** delimited by carriage-return, line-feed pairs; however, the ** encoding may not be UTF-8. 
** ************************************ WARNING ********************************** ** @@ -211,10 +216,50 @@ */ if( n==0 ) return result; /* Empty file -> text */ c = *z; if( c==0 ) return 0; /* Zero byte in a file -> binary */ j = (c!='\n'); + if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */ + if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */ + result = -1; + while( (n-=2)>0 ){ + c = *(z+=2); ++j; + if( z[1]==0 ){ /* High-byte must be 0 for further checks */ + if( c==0 ) return 0; /* Zero char in a file -> binary */ + if( c=='\n' ){ + if( j>LENGTH_MASK ){ + return 0; /* Very long line -> binary */ + } + j = 0; + } + } + if( j>LENGTH_MASK ){ + return 0; /* Very long line -> binary */ + } + } + return result; + } else if ( (c==0xfe) && (z[1]==0xff) ){ /* UTF-16 BE BOM */ + result = -2; + ++z; + while( (n-=2)>0 ){ + c = *(z+=2); ++j; + if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ + if( c==0 ) return 0; /* Zero char in a file -> binary */ + if( c=='\n' ){ + if( j>LENGTH_MASK ){ + return 0; /* Very long line -> binary */ + } + j = 0; + } + } + if( j>LENGTH_MASK ){ + return 0; /* Very long line -> binary */ + } + } + return result; + } + } while( --n>0 ){ c = *++z; ++j; if( c==0 ) return 0; /* Zero byte in a file -> binary */ if( c=='\n' ){ int c2 = z[-1]; @@ -243,16 +288,16 @@ # define WCHAR_T unsigned short # endif #endif /* -** Maximum length of a line in a text file, in UTF-16 characters. (4096) -** The number of bytes represented by this value cannot exceed LENGTH_MASK -** bytes, because that is the line buffer size used by the diff engine. +** Maximum length of a line in a text file, in UTF-16 characters. (2731) +** The number of characters represented by this value cannot exceed +** UTF16_LENGTH_MASK characters, because when converting UTF-16 +** to UTF-8 it could overflow the line buffer used by the diff engine. 
*/ -#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) -#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) +#define UTF16_LENGTH_MASK (LENGTH_MASK/3) /* ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le ** encodings. */ Index: src/diffcmd.c ================================================================== --- src/diffcmd.c +++ src/diffcmd.c @@ -76,11 +76,11 @@ ** for file names to treat as binary. If fIncludeBinary is zero, these files ** will be skipped in addition to files that may contain binary content. */ void diff_file( Blob *pFile1, /* In memory content to compare from */ - int isBin1, /* Does the 'from' content appear to be binary */ + int eType1, /* Does the 'from' content appear to be text */ const char *zFile2, /* On disk content to compare to */ const char *zName, /* Display name of the file */ const char *zDiffCmd, /* Command for comparison */ const char *zBinGlob, /* Treat file names matching this as binary */ int fIncludeBinary, /* Include binary files for external diff */ @@ -88,10 +88,11 @@ ){ if( zDiffCmd==0 ){ Blob out; /* Diff output text */ Blob file2; /* Content of zFile2 */ const char *zName2; /* Name of zFile2 for display */ + int eType2 = 0; /* Read content of zFile2 into memory */ blob_zero(&file2); if( file_wd_size(zFile2)<0 ){ zName2 = NULL_DEVICE; @@ -101,17 +102,23 @@ }else{ blob_read_from_file(&file2, zFile2); } zName2 = zName; } - + if( !fIncludeBinary ){ + eType2 = looks_like_text(&file2); + } /* Compute and output the differences */ if( diffFlags & DIFF_BRIEF ){ if( blob_compare(pFile1, &file2) ){ fossil_print("CHANGED %s\n", zName); } + }else if( eType1!=eType2 ){ + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); }else{ + blob_strip_bom(pFile1, 2); + blob_strip_bom(&file2, 2); blob_zero(&out); text_diff(pFile1, &file2, &out, diffFlags); if( blob_size(&out) ){ diff_print_filenames(zName, zName2, diffFlags); fossil_print("%s\n", blob_str(&out)); @@ -126,11 +133,12 @@ Blob 
nameFile1; /* Name of temporary file to old pFile1 content */ Blob cmd; /* Text of command to run */ if( !fIncludeBinary ){ Blob file2; - if( isBin1 ){ + int eType2; + if( eType1!=1 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); return; } if( zBinGlob ){ Glob *pBinary = glob_create(zBinGlob); @@ -147,11 +155,12 @@ blob_read_link(&file2, zFile2); }else{ blob_read_from_file(&file2, zFile2); } } - if( looks_like_binary(&file2) ){ + eType2 = looks_like_text(&file2); + if( eType2!=1 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); blob_reset(&file2); return; } blob_reset(&file2); @@ -197,12 +206,11 @@ ** will be skipped in addition to files that may contain binary content. */ void diff_file_mem( Blob *pFile1, /* In memory content to compare from */ Blob *pFile2, /* In memory content to compare to */ - int isBin1, /* Does the 'from' content appear to be binary */ - int isBin2, /* Does the 'to' content appear to be binary */ + int eType, /* Does the content appear to be text */ const char *zName, /* Display name of the file */ const char *zDiffCmd, /* Command for comparison */ const char *zBinGlob, /* Treat file names matching this as binary */ int fIncludeBinary, /* Include binary files for external diff */ u64 diffFlags /* Diff flags */ @@ -210,10 +218,12 @@ if( diffFlags & DIFF_BRIEF ) return; if( zDiffCmd==0 ){ Blob out; /* Diff output text */ blob_zero(&out); + blob_strip_bom(pFile1, 2); + blob_strip_bom(pFile2, 2); text_diff(pFile1, pFile2, &out, diffFlags); diff_print_filenames(zName, zName, diffFlags); fossil_print("%s\n", blob_str(&out)); /* Release memory resources */ @@ -222,11 +232,11 @@ Blob cmd; char zTemp1[300]; char zTemp2[300]; if( !fIncludeBinary ){ - if( isBin1 || isBin2 ){ + if( eType==0 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); return; } if( zBinGlob ){ Glob *pBinary = glob_create(zBinGlob); @@ -282,18 +292,18 @@ const char *zFileTreeName ){ Blob fname; Blob content; int isLink; - int isBin; + int eType = 0; file_tree_name(zFileTreeName, &fname, 1); 
historical_version_of_file(zFrom, blob_str(&fname), &content, &isLink, 0, - fIncludeBinary ? 0 : &isBin, 0); + fIncludeBinary ? 0 : &eType, 0); if( !isLink != !file_wd_islink(zFrom) ){ fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); }else{ - diff_file(&content, isBin, zFileTreeName, zFileTreeName, + diff_file(&content, eType, zFileTreeName, zFileTreeName, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); } blob_reset(&content); blob_reset(&fname); } @@ -389,11 +399,11 @@ srcid = 0; if( !asNewFile ){ showDiff = 0; } } if( showDiff ){ Blob content; - int isBin; + int eType = 0; if( !isLink != !file_wd_islink(zFullName) ){ diff_print_index(zPathname, diffFlags); diff_print_filenames(zPathname, zPathname, diffFlags); fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); continue; @@ -401,13 +411,15 @@ if( srcid>0 ){ content_get(srcid, &content); }else{ blob_zero(&content); } - isBin = fIncludeBinary ? 0 : looks_like_binary(&content); + if( !fIncludeBinary ){ + eType = looks_like_text(&content); + } diff_print_index(zPathname, diffFlags); - diff_file(&content, isBin, zFullName, zPathname, zDiffCmd, + diff_file(&content, eType, zFullName, zPathname, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); blob_reset(&content); } free(zToFree); } @@ -437,23 +449,26 @@ ){ char *zName; Blob fname; Blob v1, v2; int isLink1, isLink2; - int isBin1, isBin2; + int eType = 0, eType2 = 0; if( diffFlags & DIFF_BRIEF ) return; file_tree_name(zFileTreeName, &fname, 1); zName = blob_str(&fname); historical_version_of_file(zFrom, zName, &v1, &isLink1, 0, - fIncludeBinary ? 0 : &isBin1, 0); + fIncludeBinary ? 0 : &eType, 0); historical_version_of_file(zTo, zName, &v2, &isLink2, 0, - fIncludeBinary ? 0 : &isBin2, 0); + fIncludeBinary ? 
0 : &eType2, 0); if( isLink1 != isLink2 ){ diff_print_filenames(zName, zName, diffFlags); fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); + }else if( eType!=eType2 ){ + diff_print_filenames(zName, zName, diffFlags); + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); }else{ - diff_file_mem(&v1, &v2, isBin1, isBin2, zName, zDiffCmd, + diff_file_mem(&v1, &v2, eType, zName, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); } blob_reset(&v1); blob_reset(&v2); blob_reset(&fname); @@ -477,11 +492,11 @@ const char *zBinGlob, int fIncludeBinary, u64 diffFlags ){ Blob f1, f2; - int isBin1, isBin2; + int eType = 0, eType2 = 0; int rid; const char *zName = pFrom ? pFrom->zName : pTo->zName; if( diffFlags & DIFF_BRIEF ) return; diff_print_index(zName, diffFlags); if( pFrom ){ @@ -494,14 +509,21 @@ rid = uuid_to_rid(pTo->zUuid, 0); content_get(rid, &f2); }else{ blob_zero(&f2); } - isBin1 = fIncludeBinary ? 0 : looks_like_binary(&f1); - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&f2); - diff_file_mem(&f1, &f2, isBin1, isBin2, zName, zDiffCmd, - zBinGlob, fIncludeBinary, diffFlags); + if ( !fIncludeBinary ){ + eType = looks_like_text(&f1); + eType2 = looks_like_text(&f2); + } + if( eType!=eType2 ){ + diff_print_filenames(zName, zName, diffFlags); + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); + }else{ + diff_file_mem(&f1, &f2, eType, zName, zDiffCmd, + zBinGlob, fIncludeBinary, diffFlags); + } blob_reset(&f1); blob_reset(&f2); } /* Index: src/stash.c ================================================================== --- src/stash.c +++ src/stash.c @@ -306,22 +306,23 @@ ); while( db_step(&q)==SQLITE_ROW ){ int rid = db_column_int(&q, 0); int isRemoved = db_column_int(&q, 1); int isLink = db_column_int(&q, 3); - int isBin1, isBin2; + int eType = 0; const char *zOrig = db_column_text(&q, 4); const char *zNew = db_column_text(&q, 5); char *zOPath = mprintf("%s%s", g.zLocalRoot, zOrig); Blob delta, a, b, disk; if( rid==0 ){ db_ephemeral_blob(&q, 6, &a); fossil_print("ADDED %s\n", zNew); 
diff_print_index(zNew, diffFlags); - isBin1 = 0; - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&a); - diff_file_mem(&empty, &a, isBin1, isBin2, zNew, zDiffCmd, + if( !fIncludeBinary ){ + eType = looks_like_text(&a); + } + diff_file_mem(&empty, &a, eType, zNew, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); }else if( isRemoved ){ fossil_print("DELETE %s\n", zOrig); if( fBaseline==0 ){ if( file_wd_islink(zOPath) ){ @@ -331,13 +332,14 @@ } }else{ content_get(rid, &a); } diff_print_index(zNew, diffFlags); - isBin1 = fIncludeBinary ? 0 : looks_like_binary(&a); - isBin2 = 0; - diff_file_mem(&a, &empty, isBin1, isBin2, zOrig, zDiffCmd, + if( !fIncludeBinary){ + eType = looks_like_text(&a); + } + diff_file_mem(&a, &empty, eType, zOrig, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); }else{ int isOrigLink = file_wd_islink(zOPath); db_ephemeral_blob(&q, 6, &delta); if( fBaseline==0 ){ @@ -354,21 +356,29 @@ printf(DIFF_CANNOT_COMPUTE_SYMLINK); }else{ Blob *pBase = fBaseline ? &a : &disk; content_get(rid, &a); blob_delta_apply(&a, &delta, &b); - isBin1 = fIncludeBinary ? 0 : looks_like_binary(pBase); - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&b); - diff_file_mem(fBaseline? 
&a : &disk, &b, isBin1, isBin2, zNew, - zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); + int eType2 = 0; + if( !fIncludeBinary ){ + eType = looks_like_text(pBase); + eType2 = looks_like_text(&b); + } + if( eType!=eType2 ){ + diff_print_filenames(zOrig, zNew, diffFlags); + printf(DIFF_CANNOT_COMPUTE_ENCODING); + }else{ + diff_file_mem(pBase, &b, eType, zNew, zDiffCmd, + zBinGlob, fIncludeBinary, diffFlags); + } blob_reset(&a); blob_reset(&b); } if( !fBaseline ) blob_reset(&disk); } blob_reset(&delta); - } + } db_finalize(&q); } /* ** Drop the indicated stash Index: src/update.c ================================================================== --- src/update.c +++ src/update.c @@ -600,11 +600,11 @@ const char *revision, /* The checkin containing the file */ const char *file, /* Full treename of the file */ Blob *content, /* Put the content here */ int *pIsLink, /* Set to true if file is link. */ int *pIsExe, /* Set to true if file is executable */ - int *pIsBin, /* Set to true if file is binary */ + int *pEType, /* Set to file type, looks_like_utf8()&3 */ int errCode /* Error code if file not found. Panic if 0. */ ){ Manifest *pManifest; ManifestFile *pFile; int rid=0; @@ -627,12 +627,12 @@ rid = uuid_to_rid(pFile->zUuid, 0); if( pIsExe ) *pIsExe = ( manifest_file_mperm(pFile)==PERM_EXE ); if( pIsLink ) *pIsLink = ( manifest_file_mperm(pFile)==PERM_LNK ); manifest_destroy(pManifest); rc = content_get(rid, content); - if( rc && pIsBin ){ - *pIsBin = looks_like_binary(content); + if( rc && pEType ){ + *pEType = looks_like_text(content); } return rc; } manifest_destroy(pManifest); if( errCode<=0 ){