xref: /openssh-portable/utf8.c (revision cd9e1eab)
1 /* $OpenBSD: utf8.c,v 1.3 2016/05/30 12:57:21 schwarze Exp $ */
2 /*
3  * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 /*
19  * Utility functions for multibyte-character handling,
20  * in particular to sanitize untrusted strings for terminal output.
21  */
22 
23 #include <sys/types.h>
24 #include <langinfo.h>
25 #include <limits.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <vis.h>
31 #include <wchar.h>
32 
33 #include "utf8.h"
34 
35 static int	 dangerous_locale(void);
36 static int	 grow_dst(char **, size_t *, size_t, char **, size_t);
37 static int	 vasnmprintf(char **, size_t, int *, const char *, va_list);
38 
39 
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * The string returned by nl_langinfo(CODESET) for the ASCII "C"
 * locale is not standardized, so every spelling known to be used
 * by a supported platform is accepted as safe.
 *
 * Returns 0 if the current codeset is considered safe, 1 otherwise.
 */

static int
dangerous_locale(void) {
	static const char *const safe[] = {
		"UTF-8",
		"US-ASCII",		/* OpenBSD */
		"ANSI_X3.4-1968",	/* Linux (glibc) */
		"ASCII",		/* Linux (musl) */
		"ISO8859-1",		/* AIX */
		"646",			/* Solaris, NetBSD */
		""			/* Solaris 6 */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe) / sizeof(safe[0]); i++)
		if (strcmp(loc, safe[i]) == 0)
			return 0;
	return 1;
}
56 
/*
 * Make sure the buffer *dst (currently *sz bytes, write cursor *dp)
 * has room for "need" more bytes plus a terminating '\0'.
 *
 * If there is not enough room, the buffer is reallocated, normally
 * in steps of 128 bytes, but never beyond maxsz bytes in total.
 * On reallocation, *dst, *dp and *sz are updated to describe the
 * new buffer; the offset of the cursor is preserved.
 *
 * Returns 0 if the room is available on return.  Returns -1 if the
 * request cannot be satisfied within maxsz or if realloc(3) fails;
 * in that case *dst, *dp and *sz are left untouched and the old
 * buffer remains valid.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;	/* Bytes already written, i.e. offset of *dp. */
	size_t	 want;	/* Minimum buffer size satisfying the request. */
	size_t	 tsz;

	/*
	 * Work with offsets rather than with "*dp + need": forming a
	 * pointer beyond one past the end of the buffer is undefined.
	 */
	used = (size_t)(*dp - *dst);
	if (need >= (size_t)-1 - used)
		return -1;
	want = used + need + 1;
	if (want <= *sz)
		return 0;

	/* Grow by the usual step, or by more if the step is too small. */
	tsz = *sz + 128;
	if (tsz < want)
		tsz = want;
	if (tsz > maxsz)
		tsz = maxsz;
	if (tsz < want)
		return -1;

	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}
75 
76 /*
77  * The following two functions limit the number of bytes written,
78  * including the terminating '\0', to sz.  Unless wp is NULL,
79  * they limit the number of display columns occupied to *wp.
80  * Whichever is reached first terminates the output string.
81  * To stay close to the standard interfaces, they return the number of
82  * non-NUL bytes that would have been written if both were unlimited.
83  * If wp is NULL, newline, carriage return, and tab are allowed;
84  * otherwise, the actual number of columns occupied by what was
85  * written is returned in *wp.
86  */
87 
88 static int
89 vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
90 {
91 	char	*src;	/* Source string returned from vasprintf. */
92 	char	*sp;	/* Pointer into src. */
93 	char	*dst;	/* Destination string to be returned. */
94 	char	*dp;	/* Pointer into dst. */
95 	char	*tp;	/* Temporary pointer for dst. */
96 	size_t	 sz;	/* Number of bytes allocated for dst. */
97 	wchar_t	 wc;	/* Wide character at sp. */
98 	int	 len;	/* Number of bytes in the character at sp. */
99 	int	 ret;	/* Number of bytes needed to format src. */
100 	int	 width;	/* Display width of the character wc. */
101 	int	 total_width, max_width, print;
102 
103 	src = NULL;
104 	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
105 		goto fail;
106 
107 	sz = strlen(src) + 1;
108 	if ((dst = malloc(sz)) == NULL) {
109 		free(src);
110 		goto fail;
111 	}
112 
113 	if (maxsz > INT_MAX)
114 		maxsz = INT_MAX;
115 
116 	sp = src;
117 	dp = dst;
118 	ret = 0;
119 	print = 1;
120 	total_width = 0;
121 	max_width = wp == NULL ? INT_MAX : *wp;
122 	while (*sp != '\0') {
123 		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
124 			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
125 			if (dangerous_locale()) {
126 				ret = -1;
127 				break;
128 			}
129 			len = 1;
130 			width = -1;
131 		} else if (wp == NULL &&
132 		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
133 			/*
134 			 * Don't use width uninitialized; the actual
135 			 * value doesn't matter because total_width
136 			 * is only returned for wp != NULL.
137 			 */
138 			width = 0;
139 		} else if ((width = wcwidth(wc)) == -1 &&
140 		    dangerous_locale()) {
141 			ret = -1;
142 			break;
143 		}
144 
145 		/* Valid, printable character. */
146 
147 		if (width >= 0) {
148 			if (print && (dp - dst >= (int)maxsz - len ||
149 			    total_width > max_width - width))
150 				print = 0;
151 			if (print) {
152 				if (grow_dst(&dst, &sz, maxsz,
153 				    &dp, len) == -1) {
154 					ret = -1;
155 					break;
156 				}
157 				total_width += width;
158 				memcpy(dp, sp, len);
159 				dp += len;
160 			}
161 			sp += len;
162 			if (ret >= 0)
163 				ret += len;
164 			continue;
165 		}
166 
167 		/* Escaping required. */
168 
169 		while (len > 0) {
170 			if (print && (dp - dst >= (int)maxsz - 4 ||
171 			    total_width > max_width - 4))
172 				print = 0;
173 			if (print) {
174 				if (grow_dst(&dst, &sz, maxsz,
175 				    &dp, 4) == -1) {
176 					ret = -1;
177 					break;
178 				}
179 				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
180 				width = tp - dp;
181 				total_width += width;
182 				dp = tp;
183 			} else
184 				width = 4;
185 			len--;
186 			sp++;
187 			if (ret >= 0)
188 				ret += width;
189 		}
190 		if (len > 0)
191 			break;
192 	}
193 	free(src);
194 	*dp = '\0';
195 	*str = dst;
196 	if (wp != NULL)
197 		*wp = total_width;
198 
199 	/*
200 	 * If the string was truncated by the width limit but
201 	 * would have fit into the size limit, the only sane way
202 	 * to report the problem is using the return value, such
203 	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
204 	 * works as expected.
205 	 */
206 
207 	if (ret < (int)maxsz && !print)
208 		ret = -1;
209 	return ret;
210 
211 fail:
212 	if (wp != NULL)
213 		*wp = 0;
214 	if (ret == 0) {
215 		*str = src;
216 		return 0;
217 	} else {
218 		*str = NULL;
219 		return -1;
220 	}
221 }
222 
/*
 * Sanitizing counterpart of snprintf(3): format into the caller's
 * buffer str of sz bytes, limiting the output to *wp display columns
 * unless wp is NULL.  The work is done by vasnmprintf(); its result
 * string is copied into str with strlcpy(3) and then released.
 *
 * Returns whatever vasnmprintf() returned.  If no result string is
 * available, str is set to the empty string, provided that sz leaves
 * room for the terminating '\0'.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never store into a zero-sized buffer. */
		*str = '\0';
	}
	return ret;
}
240 
241 /*
242  * To stay close to the standard interfaces, the following functions
243  * return the number of non-NUL bytes written.
244  */
245 
/*
 * Sanitizing counterpart of vfprintf(3): format fmt/ap through
 * vasnmprintf() without a column limit and write the result to
 * stream with fputs(3).
 *
 * Returns the value reported by vasnmprintf(), or -1 if formatting
 * or writing failed.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() may hand back a partially built
		 * string even when it fails; do not leak it.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
259 
/*
 * Variadic front end to vfmprintf(): write sanitized, formatted
 * output to stream and return vfmprintf()'s result.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
271 
/*
 * Variadic front end to vfmprintf() for standard output: write
 * sanitized, formatted output to stdout and return vfmprintf()'s
 * result.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}
283