Index: src/blob.c ================================================================== --- src/blob.c +++ src/blob.c @@ -1027,10 +1027,65 @@ if( z[i]!='\r' ) z[j++] = z[i]; } z[j] = 0; p->nUsed = j; } + +/* +** Convert blob from cp1252 to utf-8. As cp1252 is a superset +** of iso8859-1, this is useful on UNIX as well. +** +** This table contains the character translations for 0x80..0x9F. +*/ + +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; + +void blob_cp1252_to_utf8(Blob *p){ + unsigned char *z = (unsigned char *)p->aData; + int j = p->nUsed; + int i, n; + for(i=n=0; i<j; i++){ + if( z[i]>=0x80 ){ + if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ + n++; + } + n++; + } + } + j += n; + if( j>=p->nAlloc ){ + blob_resize(p, j); + z = (unsigned char *)p->aData; + } + p->nUsed = j; + z[j] = 0; + while( j>i ){ + if( z[--i]>=0x80 ){ + if( z[i]<0xa0 ){ + unsigned short sym = cp1252[z[i]&0x1f]; + if( sym>=0x800 ){ + z[--j] = 0x80 | (sym&0x3f); + z[--j] = 0x80 | ((sym>>6)&0x3f); + z[--j] = 0xe0 | (sym>>12); + }else{ + z[--j] = 0x80 | (sym&0x3f); + z[--j] = 0xc0 | (sym>>6); + } + }else{ + z[--j] = 0x80 | (z[i]&0x3F); + z[--j] = 0xC0 | (z[i]>>6); + } + }else{ + z[--j] = z[i]; + } + } +} /* ** Shell-escape the given string. Append the result to a blob. */ void shell_escape(Blob *pBlob, const char *zIn){ Index: src/checkin.c ================================================================== --- src/checkin.c +++ src/checkin.c @@ -908,10 +908,55 @@ static int allOk = 0; /* Set to true to disable this routine */ if( allOk ) return 0; fUnicode = starts_with_utf16_bom(p, 0, 0); eType = fUnicode ? 
looks_like_utf16(p) : looks_like_utf8(p); + if( eType<-2){ + const char *zWarning; + const char *zDisable; + const char *zConvert; + Blob ans; + char cReply; + + if(eType==-4){ + if (binOk) goto go_on; + zWarning = "long lines"; + zDisable = "\"binary-glob\" setting"; + zConvert = ""; + }else{ + if (encodingOk) goto go_on; + zWarning = "invalid UTF-8"; + zDisable = "\"encoding-glob\" setting"; + zConvert = "c=convert/"; + } + blob_zero(&ans); + file_relative_name(zFilename, &fname, 0); + zMsg = mprintf( + "%s appears to be text, but contains %s. Use --no-warnings or the" + " %s to disable this warning.\nCommit anyhow (a=all/%sy/N)? ", + blob_str(&fname), zWarning, zDisable, zConvert); + prompt_user(zMsg, &ans); + fossil_free(zMsg); + cReply = blob_str(&ans)[0]; + if( *zConvert && (cReply=='c' || cReply=='C') ){ + char *zOrig = file_newname(zFilename, "original", 1); + FILE *f; + blob_write_to_file(p, zOrig); + fossil_free(zOrig); + f = fossil_fopen(zFilename, "wb"); + blob_cp1252_to_utf8(p); + fwrite(blob_buffer(p), 1, blob_size(p), f); + fclose(f); + return 1; + } else if( cReply!='y' && cReply!='Y' ){ + fossil_fatal("Abandoning commit due to %s in %s", + zWarning, blob_str(&fname)); + } + blob_reset(&ans); + go_on: + eType +=4 ; + } if( eType==0 || eType==-1 || fUnicode ){ const char *zWarning; const char *zDisable; const char *zConvert = "c=convert/"; Blob ans; Index: src/diff.c ================================================================== --- src/diff.c +++ src/diff.c @@ -57,11 +57,11 @@ "more than 10,000 changes\n" #define DIFF_TOO_MANY_CHANGES_HTML \ "<p class='generalError'>More than 10,000 changes</p>\n" -#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) +#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0) #endif /* INTERFACE */ /* ** Maximum length of a line in a text file, in bytes. 
(2**13 = 8192 bytes) */ @@ -181,70 +181,103 @@ /* Return results */ *pnLine = nLine; return a; } +/* +** Macro which checks for proper UTF-8, when the first byte >= 0x80 +** It uses the method described in: +** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences +** except for the "overlong form" which is not considered +** invalid: Some languages like Java and Tcl use it. +** +** Any invalid byte causes bit 2 of result to be set (result |= 4), +** otherwise for valid multibyte utf-8 sequences n, j and z are +** updated so the continuation bytes are not checked again. + */ +#define CHECKUTF8(c) \ +if( c<0xC0 || c>=0xF8 ){ \ + result |= 4; /* Invalid 1-byte or multibyte UTF-8, continue */ \ +}else do{ \ + /* Check if all continuation bytes >=0x80 and <0xC0 */ \ + if( n<2 || ((z[1]&0xC0)!=0x80) ){ \ + result |= 4; /* Invalid continuation byte, continue */ \ + break; \ + }else{ \ + /* prepare for checking remaining continuation bytes */ \ + c<<=1; --n; ++j; ++z; \ + } \ +}while( c>=0xC0 ); + /* ** This function attempts to scan each logical line within the blob to ** determine the type of content it appears to contain. Possible return ** values are: ** ** (1) -- The content appears to consist entirely of text, with lines -** delimited by line-feed characters; however, the encoding may -** not be UTF-8. +** delimited by line-feed characters. ** ** (0) -- The content appears to be binary because it contains embedded ** NUL characters or an extremely long line. Since this function ** does not understand UTF-16, it may falsely consider UTF-16 text ** to be binary. ** ** (-1) -- The content appears to consist entirely of text, with lines -** delimited by carriage-return, line-feed pairs; however, the -** encoding may not be UTF-8. +** delimited by carriage-return, line-feed pairs. +** +** (-3, -5) The same as (1, -1); however, the encoding is not UTF-8 or ASCII. 
+** +** (-4) -- The same as 0, but the determination is based on the fact that +** the blob might be text (any encoding) but it has a line length +** bigger than the diff logic in fossil can handle. ** ************************************ WARNING ********************************** ** -** This function does not validate that the blob content is properly formed -** UTF-8. It assumes that all code points are the same size. It does not -** validate any code points. It makes no attempt to detect if any [invalid] -** switches between UTF-8 and other encodings occur. +** This function does not validate any code points. ** ** The only code points that this function cares about are the NUL character, ** carriage-return, and line-feed. ** ************************************ WARNING ********************************** */ int looks_like_utf8(const Blob *pContent){ - const char *z = blob_buffer(pContent); + const unsigned char *z = (unsigned char *) blob_buffer(pContent); unsigned int n = blob_size(pContent); - int j, c; - int result = 1; /* Assume UTF-8 text with no CR/NL */ + unsigned int j; + unsigned char c; + int result = 0; /* Assume UTF-8 text with no CR/NL */ /* Check individual lines. 
*/ - if( n==0 ) return result; /* Empty file -> text */ + if( n==0 ) return 1; /* Empty file -> text */ c = *z; - if( c==0 ) return 0; /* Zero byte in a file -> binary */ j = (c!='\n'); + if( c&0x80 ){ + CHECKUTF8(c) + } else if( c==0 ){ + return 0; /* Zero byte in a file -> binary */ + } while( --n>0 ){ c = *++z; ++j; - if( c==0 ) return 0; /* Zero byte in a file -> binary */ - if( c=='\n' ){ - int c2 = z[-1]; - if( c2=='\r' ){ - result = -1; /* Contains CR/NL, continue */ + if( c&0x80 ){ + CHECKUTF8(c) + } else if( c==0 ){ + return 0; /* Zero byte in a file -> binary */ + } else if( c=='\n' ){ + if( z[-1]=='\r' ){ + result |= 2; /* Contains CR/NL, continue */ } if( j>LENGTH_MASK ){ - return 0; /* Very long line -> binary */ + return -4; /* Very long line -> binary */ } j = 0; } } if( j>LENGTH_MASK ){ - return 0; /* Very long line -> binary */ + return -4; /* Very long line -> binary */ } - return result; /* No problems seen -> not binary */ + return 1-result; /* No problems seen -> not binary */ } /* ** Define the type needed to represent a Unicode (UTF-16) character. */ @@ -288,10 +321,14 @@ ** to be binary. ** ** (-1) -- The content appears to consist entirely of text, with lines ** delimited by carriage-return, line-feed pairs; however, the ** encoding may not be UTF-16. +** +** (-4) -- The same as 0, but the determination is based on the fact that +** the blob might be text (any encoding) but it has a line length +** bigger than the diff logic in fossil can handle. ** ************************************ WARNING ********************************** ** ** This function does not validate that the blob content is properly formed ** UTF-16. It assumes that all code points are the same size. 
It does not @@ -323,17 +360,17 @@ int c2 = z[-1]; if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ result = -1; /* Contains CR/NL, continue */ } if( j>UTF16_LENGTH_MASK ){ - return 0; /* Very long line -> binary */ + return -4; /* Very long line -> binary */ } j = 0; } } if( j>UTF16_LENGTH_MASK ){ - return 0; /* Very long line -> binary */ + return -4; /* Very long line -> binary */ } return result; /* No problems seen -> not binary */ } /*