xref: /openssh-portable/utf8.c (revision ac284a35)
1 /* $OpenBSD: utf8.c,v 1.2 2016/05/30 12:05:56 schwarze Exp $ */
2 /*
3  * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 /*
19  * Utility functions for multibyte-character handling,
20  * in particular to sanitize untrusted strings for terminal output.
21  */
22 
23 #include <sys/types.h>
24 #include <langinfo.h>
25 #include <limits.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <vis.h>
31 #include <wchar.h>
32 
33 #include "utf8.h"
34 
35 static int	 dangerous_locale(void);
36 static int	 vasnmprintf(char **, size_t, int *, const char *, va_list);
37 
38 
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err to the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if the codeset of the current locale is UTF-8 or one of
 * the known spellings of US-ASCII, and 1 otherwise.
 */

static int
dangerous_locale(void) {
	/*
	 * nl_langinfo(CODESET) does not use one canonical name for
	 * US-ASCII across C libraries, so accept the common spellings
	 * instead of only the one used on OpenBSD.
	 */
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",
		"ANSI_X3.4-1968",	/* glibc name for the C locale codeset */
		"646",			/* ISO 646 spelling used by some systems */
		"ASCII"
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++)
		if (strcmp(loc, safe_codesets[i]) == 0)
			return 0;
	return 1;
}
55 
56 /*
57  * The following two functions limit the number of bytes written,
58  * including the terminating '\0', to sz.  Unless wp is NULL,
59  * they limit the number of display columns occupied to *wp.
60  * Whichever is reached first terminates the output string.
61  * To stay close to the standard interfaces, they return the number of
62  * non-NUL bytes that would have been written if both were unlimited.
63  * If wp is NULL, newline, carriage return, and tab are allowed;
64  * otherwise, the actual number of columns occupied by what was
65  * written is returned in *wp.
66  */
67 
68 static int
69 vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
70 {
71 	char	*src;	/* Source string returned from vasprintf. */
72 	char	*sp;	/* Pointer into src. */
73 	char	*dst;	/* Destination string to be returned. */
74 	char	*dp;	/* Pointer into dst. */
75 	char	*tp;	/* Temporary pointer for dst. */
76 	size_t	 sz;	/* Number of bytes allocated for dst. */
77 	size_t	 tsz;	/* Temporary size while extending dst. */
78 	wchar_t	 wc;	/* Wide character at sp. */
79 	int	 len;	/* Number of bytes in the character at sp. */
80 	int	 ret;	/* Number of bytes needed to format src. */
81 	int	 width;	/* Display width of the character wc. */
82 	int	 total_width, max_width, print;
83 
84 	src = NULL;
85 	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
86 		goto fail;
87 
88 	sz = strlen(src);
89 	if ((dst = malloc(sz)) == NULL) {
90 		free(src);
91 		goto fail;
92 	}
93 
94 	if (maxsz > INT_MAX)
95 		maxsz = INT_MAX;
96 
97 	sp = src;
98 	dp = dst;
99 	ret = 0;
100 	print = 1;
101 	total_width = 0;
102 	max_width = wp == NULL ? INT_MAX : *wp;
103 	while (*sp != '\0') {
104 		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
105 			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
106 			if (dangerous_locale()) {
107 				ret = -1;
108 				break;
109 			}
110 			len = 1;
111 			width = -1;
112 		} else if (wp == NULL &&
113 		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
114 			/*
115 			 * Don't use width uninitialized; the actual
116 			 * value doesn't matter because total_width
117 			 * is only returned for wp != NULL.
118 			 */
119 			width = 0;
120 		} else if ((width = wcwidth(wc)) == -1 &&
121 		    dangerous_locale()) {
122 			ret = -1;
123 			break;
124 		}
125 
126 		/* Valid, printable character. */
127 
128 		if (width >= 0) {
129 			if (print && (dp - dst >= (int)maxsz - len ||
130 			    total_width > max_width - width))
131 				print = 0;
132 			if (print) {
133 				total_width += width;
134 				memcpy(dp, sp, len);
135 				dp += len;
136 			}
137 			sp += len;
138 			if (ret >= 0)
139 				ret += len;
140 			continue;
141 		}
142 
143 		/* Escaping required. */
144 
145 		while (len > 0) {
146 			if (print && (dp - dst >= (int)maxsz - 4 ||
147 			    total_width > max_width - 4))
148 				print = 0;
149 			if (print) {
150 				if (dp + 4 >= dst + sz) {
151 					tsz = sz + 128;
152 					if (tsz > maxsz)
153 						tsz = maxsz;
154 					tp = realloc(dst, tsz);
155 					if (tp == NULL) {
156 						ret = -1;
157 						break;
158 					}
159 					dp = tp + (dp - dst);
160 					dst = tp;
161 					sz = tsz;
162 				}
163 				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
164 				width = tp - dp;
165 				total_width += width;
166 				dp = tp;
167 			} else
168 				width = 4;
169 			len--;
170 			sp++;
171 			if (ret >= 0)
172 				ret += width;
173 		}
174 		if (len > 0)
175 			break;
176 	}
177 	free(src);
178 	*dp = '\0';
179 	*str = dst;
180 	if (wp != NULL)
181 		*wp = total_width;
182 
183 	/*
184 	 * If the string was truncated by the width limit but
185 	 * would have fit into the size limit, the only sane way
186 	 * to report the problem is using the return value, such
187 	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
188 	 * works as expected.
189 	 */
190 
191 	if (ret < (int)maxsz && !print)
192 		ret = -1;
193 	return ret;
194 
195 fail:
196 	if (wp != NULL)
197 		*wp = 0;
198 	if (ret == 0) {
199 		*str = src;
200 		return 0;
201 	} else {
202 		*str = NULL;
203 		return -1;
204 	}
205 }
206 
/*
 * Format fmt and the following arguments like snprintf(3), but
 * sanitize the result with vasnmprintf() before copying at most
 * sz bytes, including the terminating '\0', into str.
 * Unless wp is NULL, *wp limits the number of display columns on
 * input and receives the number of columns written on output.
 * Returns whatever vasnmprintf() returns.  If no string could be
 * produced at all, str is set to the empty string, provided that
 * sz leaves room for the terminating '\0'.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never touch a buffer the caller declared to be empty. */
		*str = '\0';
	}
	return ret;
}
224 
/*
 * To stay close to the standard interfaces, the following functions
 * return the number of non-NUL bytes written.
 */

/*
 * Format fmt and ap, sanitize the result with vasnmprintf() without
 * any size or width limit, and write it to stream with fputs(3).
 * Returns -1 if formatting, sanitizing, or writing fails.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * A partially sanitized string may have been
		 * handed back even though an error is reported.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
243 
/*
 * Variadic front end to vfmprintf(): sanitize the formatted
 * output and write it to stream.  Returns what vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
255 
/*
 * Variadic front end to vfmprintf() that writes the sanitized,
 * formatted output to stdout.  Returns what vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}
267