git/utf8.h at 764bd200e1077b44f291973a623debccf151fcb2 - mirrors/git - Incest Forge: Beyond sex. We incest.

mirrors/git

mirror of https://github.com/git/git.git synced 2024-10-31 06:17:56 +01:00

Torsten Bögershausen aab2a1ae48 Support working-tree-encoding "UTF-16LE-BOM"

Users who want UTF-16 files in the working tree set the .gitattributes
like this:
test.txt working-tree-encoding=UTF-16

The Unicode standard itself defines 3 allowed ways to encode UTF-16.
The following 3 versions convert all back to 'g' 'i' 't' in UTF-8:

a) UTF-16, without BOM, big endian:
$ printf "\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

b) UTF-16, with BOM, little endian:
$ printf "\377\376g\000i\000t\000" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

c) UTF-16, with BOM, big endian:
$ printf "\376\377\000g\000i\000t" | iconv -f UTF-16 -t UTF-8 | od -c
0000000    g   i   t

Git uses libiconv to convert from UTF-8 in the index into UTF-16 in the
working tree.
After a checkout, the resulting file has a BOM and is encoded in "UTF-16",
in the version (c) above.
This is what iconv generates, more details follow below.

iconv (and libiconv) can generate UTF-16, UTF-16LE or UTF-16BE:

d) UTF-16
$ printf 'git' | iconv -f UTF-8 -t UTF-16 | od -c
0000000  376 377  \0   g  \0   i  \0   t

e) UTF-16LE
$ printf 'git' | iconv -f UTF-8 -t UTF-16LE | od -c
0000000    g  \0   i  \0   t  \0

f)  UTF-16BE
$ printf 'git' | iconv -f UTF-8 -t UTF-16BE | od -c
0000000   \0   g  \0   i  \0   t

There is no way to generate version (b) from above in a Git working tree,
but that is what some applications need.
(All fully unicode aware applications should be able to read all 3 variants,
but in practice we are not there yet).

When producing UTF-16 as an output, iconv generates the big endian version
with a BOM. (big endian is probably chosen for historical reasons).

iconv can produce UTF-16 files with little endianness by using "UTF-16LE"
as encoding, and that file does not have a BOM.

Not all users (especially under Windows) are happy with this.
Some tools are not fully unicode aware and can only handle version (b).

Today there is no way to produce version (b) with iconv (or libiconv).
Looking into the history of iconv, it seems as if version (c) will
be used in all future iconv versions (for compatibility reasons).

Solve this dilemma and introduce a Git-specific "UTF-16LE-BOM".
libiconv cannot handle the encoding, so Git picks it up, handles the BOM
and uses libiconv to convert the rest of the stream.
(UTF-16BE-BOM is added for consistency)

Reported-by: Adrián Gimeno Balaguer <adrigibal@gmail.com>
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>

2019-01-31 10:27:52 -08:00

108 lines

3.6 KiB

C

Raw Blame History

 #ifndef GIT_UTF8_H
 #define GIT_UTF8_H
 /* Forward declaration only: this header refers to strbuf solely through pointers. */
 struct strbuf;
 /*
  * NOTE(review): presumably holds a single UCS code point -- confirm against
  * its users. The trailing note records the assumption that int is 32 bits.
  */
 typedef unsigned int ucs_char_t;  /* assuming 32bit int */
 /*
  * Width and encoding queries. Only the prototypes are visible in this
  * header; the per-function notes below are inferred from names and
  * signatures and should be confirmed against the implementation.
  */
 /* Presumably the byte length of a display-mode escape sequence starting at s -- confirm. */
 size_t display_mode_esc_sequence_len(const char *s);
 /*
  * Presumably the display width of the character at *start; both arguments
  * are in/out pointers, so the position and remaining length are likely
  * updated by the call -- confirm.
  */
 int utf8_width(const char **start, size_t *remainder_p);
 /* Presumably the display width of the first len bytes of string; skip_ansi is a flag -- confirm its exact effect. */
 int utf8_strnwidth(const char *string, int len, int skip_ansi);
 /* Presumably the display width of a NUL-terminated string -- confirm. */
 int utf8_strwidth(const char *string);
 /* Presumably nonzero when text is valid UTF-8 -- confirm. */
 int is_utf8(const char *text);
 /* Presumably nonzero when name denotes the UTF-8 encoding -- confirm. */
 int is_encoding_utf8(const char *name);
 /* Presumably nonzero when both encoding names refer to the same encoding -- confirm. */
 int same_encoding(const char *, const char *);
 /*
  * fprintf-style output function. The format attribute tells the compiler
  * that argument 2 is a printf-style format string and that the variadic
  * arguments to check against it begin at argument 3.
  */
 __attribute__((format (printf, 2, 3)))
 int utf8_fprintf(FILE *, const char *, ...);
 /* Defined elsewhere; presumably the bytes of the UTF-8 byte order mark -- confirm. */
 extern const char utf8_bom[];
 /*
  * NOTE(review): presumably advances the caller's text pointer past a
  * leading UTF-8 BOM (the size_t being the available length) and reports
  * whether one was skipped -- confirm against the implementation.
  */
 extern int skip_utf8_bom(char **, size_t);
 /*
  * Helpers that write into a caller-supplied strbuf. Only the prototypes
  * are visible here; the notes are inferred from names and parameters and
  * should be confirmed against the implementation.
  */
 /* Presumably appends text to buf, wrapped to width with indent/indent2 as indentation amounts -- confirm. */
 void strbuf_add_wrapped_text(struct strbuf *buf,
 		const char *text, int indent, int indent2, int width);
 /* Like the function above, but takes an explicit byte count (len) instead of a NUL-terminated string -- presumably; confirm. */
 void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
 			     int indent, int indent2, int width);
 /* Presumably replaces width display columns of sb starting at pos with subst -- confirm units and semantics. */
 void strbuf_utf8_replace(struct strbuf *sb, int pos, int width,
 			 const char *subst);
 #ifndef NO_ICONV
 /*
  * Re-encoding entry points, declared only when NO_ICONV is not defined.
  *
  * NOTE(review): reencode_string_iconv presumably converts insz bytes of
  * 'in' through an already opened iconv descriptor; the role of bom_len is
  * not visible from this header -- confirm against the implementation.
  */
 char *reencode_string_iconv(const char *in, size_t insz,
 			    iconv_t conv, size_t bom_len, size_t *outsz);
 /*
  * Convert insz bytes of 'in' from in_encoding to out_encoding (presumably;
  * confirm). outsz may be NULL: reencode_string() below passes NULL. The
  * NO_ICONV stub in this header returns NULL, so callers must handle a
  * NULL result.
  */
 char *reencode_string_len(const char *in, size_t insz,
 			  const char *out_encoding,
 			  const char *in_encoding,
 			  size_t *outsz);
 #else
 /*
  * Stand-in used when iconv support is compiled out (NO_ICONV defined).
  * No conversion is attempted: the output size, if requested, is set to 0
  * and NULL is returned. The parameters correspond positionally to the
  * iconv-enabled prototype (in, insz, out_encoding, in_encoding, outsz).
  */
 static inline char *reencode_string_len(const char *a, size_t b,
 					const char *c, const char *d,
 					size_t *e)
 {
 	/* Inputs are intentionally unused in this configuration. */
 	(void)a;
 	(void)b;
 	(void)c;
 	(void)d;

 	if (e != NULL)
 		*e = 0;
 	return NULL;
 }
 #endif
 /*
  * Convenience wrapper around reencode_string_len() for NUL-terminated
  * input: the input length is measured with strlen() and no output size
  * is requested (NULL is passed for outsz). Returns whatever
  * reencode_string_len() returns.
  */
 static inline char *reencode_string(const char *in,
 				    const char *out_encoding,
 				    const char *in_encoding)
 {
 	size_t insz = strlen(in);

 	return reencode_string_len(in, insz, out_encoding, in_encoding, NULL);
 }
 /*
  * NOTE(review): presumably returns the byte length of the next character
  * of *text in the given encoding, with text/remainder_p being in/out
  * arguments -- confirm against the implementation.
  */
 int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding);
 /*
  * Returns true if the path would match ".git" after HFS case-folding.
  * The path should be NUL-terminated, but we will match variants of both ".git\0"
  * and ".git/..." (but _not_ ".../.git"). This makes it suitable for both fsck
  * and verify_path().
  *
  * Likewise, the is_hfs_dotgitfoo() variants look for ".gitfoo"; the
  * variants declared here cover ".gitmodules", ".gitignore" and
  * ".gitattributes".
  */
 int is_hfs_dotgit(const char *path);
 int is_hfs_dotgitmodules(const char *path);
 int is_hfs_dotgitignore(const char *path);
 int is_hfs_dotgitattributes(const char *path);
 /* Alignment choices accepted as the 'position' argument of strbuf_utf8_align(). */
 typedef enum {
 	ALIGN_LEFT = 0,
 	ALIGN_MIDDLE = 1,
 	ALIGN_RIGHT = 2
 } align_type;
 /*
  * Align the given string and store it into a strbuf as per the
  * 'position' and 'width'. If the given string's length is larger than
  * 'width', then the input string is not truncated and no
  * alignment is done.
  */
 void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width,
 		       const char *s);
 /*
  * If a data stream is declared as UTF-16BE or UTF-16LE, then a UTF-16
  * BOM must not be used [1]. The same applies for the UTF-32 equivalents.
  * The function returns true if this rule is violated.
  *
  * 'enc' is the declared encoding name; 'data' and 'len' presumably
  * describe the bytes to inspect (confirm against the implementation).
  *
  * [1] http://unicode.org/faq/utf_bom.html#bom10
  */
 int has_prohibited_utf_bom(const char *enc, const char *data, size_t len);
 /*
  * If the endianness is not defined in the encoding name, then we
  * require a BOM. The function returns true if a required BOM is missing.
  *
  * The Unicode standard instructs to assume big-endian if there is no
  * BOM for UTF-16/32 [1][2]. However, the W3C/WHATWG encoding standard
  * used in HTML5 recommends to assume little-endian to "deal with
  * deployed content" [3].
  *
  * Therefore, strictly requiring a BOM seems to be the safest option for
  * content in Git.
  *
  * [1] http://unicode.org/faq/utf_bom.html#gen6
  * [2] http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
  *     Section 3.10, D98, page 132
  * [3] https://encoding.spec.whatwg.org/#utf-16le
  */
 int is_missing_required_utf_bom(const char *enc, const char *data, size_t len);
 #endif