xref: /openssh-portable/utf8.c (revision 0e059cdf)
1 /* $OpenBSD: utf8.c,v 1.1 2016/05/25 23:48:45 schwarze Exp $ */
2 /*
3  * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 /*
19  * Utility functions for multibyte-character handling,
20  * in particular to sanitize untrusted strings for terminal output.
21  */
22 
23 #include <sys/types.h>
24 #include <langinfo.h>
25 #include <limits.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <vis.h>
31 #include <wchar.h>
32 
33 #include "utf8.h"
34 
/* File-local helpers; both are defined further down in this file. */
static int	 dangerous_locale(void);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);
37 
38 
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * Returns 0 if the codeset of the current locale is US-ASCII or
 * UTF-8, and 1 otherwise.  The name nl_langinfo(CODESET) reports for
 * plain ASCII differs between C libraries, so all common spellings
 * are accepted rather than just "US-ASCII".
 */

static int
dangerous_locale(void) {
	static const char *const safe[] = {
		"UTF-8",
		"US-ASCII",		/* e.g. the BSDs */
		"ANSI_X3.4-1968",	/* e.g. glibc */
		"ASCII",		/* e.g. musl */
		"646"			/* e.g. Solaris */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe) / sizeof(safe[0]); i++)
		if (strcmp(loc, safe[i]) == 0)
			return 0;
	return 1;
}
55 
56 /*
57  * The following two functions limit the number of bytes written,
58  * including the terminating '\0', to sz.  Unless wp is NULL,
59  * they limit the number of display columns occupied to *wp.
60  * Whichever is reached first terminates the output string.
61  * To stay close to the standard interfaces, they return the number of
62  * non-NUL bytes that would have been written if both were unlimited.
63  * If wp is NULL, newline, carriage return, and tab are allowed;
64  * otherwise, the actual number of columns occupied by what was
65  * written is returned in *wp.
66  */
67 
68 static int
69 vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
70 {
71 	char	*src;	/* Source string returned from vasprintf. */
72 	char	*sp;	/* Pointer into src. */
73 	char	*dst;	/* Destination string to be returned. */
74 	char	*dp;	/* Pointer into dst. */
75 	char	*tp;	/* Temporary pointer for dst. */
76 	size_t	 sz;	/* Number of bytes allocated for dst. */
77 	size_t	 tsz;	/* Temporary size while extending dst. */
78 	wchar_t	 wc;	/* Wide character at sp. */
79 	int	 len;	/* Number of bytes in the character at sp. */
80 	int	 ret;	/* Number of bytes needed to format src. */
81 	int	 width;	/* Display width of the character wc. */
82 	int	 total_width, max_width, print;
83 
84 	src = dst = NULL;
85 	if (vasprintf(&src, fmt, ap) <= 0)
86 		goto fail;
87 
88 	sz = strlen(src);
89 	if ((dst = malloc(sz)) == NULL)
90 		goto fail;
91 
92 	if (maxsz > INT_MAX)
93 		maxsz = INT_MAX;
94 
95 	sp = src;
96 	dp = dst;
97 	ret = 0;
98 	print = 1;
99 	total_width = 0;
100 	max_width = wp == NULL ? INT_MAX : *wp;
101 	while (*sp != '\0') {
102 		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
103 			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
104 			if (dangerous_locale()) {
105 				ret = -1;
106 				break;
107 			}
108 			len = 1;
109 			width = -1;
110 		} else if (wp == NULL &&
111 		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
112 			/*
113 			 * Don't use width uninitialized; the actual
114 			 * value doesn't matter because total_width
115 			 * is only returned for wp != NULL.
116 			 */
117 			width = 0;
118 		} else if ((width = wcwidth(wc)) == -1 &&
119 		    dangerous_locale()) {
120 			ret = -1;
121 			break;
122 		}
123 
124 		/* Valid, printable character. */
125 
126 		if (width >= 0) {
127 			if (print && (dp - dst >= (int)maxsz - len ||
128 			    total_width > max_width - width))
129 				print = 0;
130 			if (print) {
131 				total_width += width;
132 				memcpy(dp, sp, len);
133 				dp += len;
134 			}
135 			sp += len;
136 			if (ret >= 0)
137 				ret += len;
138 			continue;
139 		}
140 
141 		/* Escaping required. */
142 
143 		while (len > 0) {
144 			if (print && (dp - dst >= (int)maxsz - 4 ||
145 			    total_width > max_width - 4))
146 				print = 0;
147 			if (print) {
148 				if (dp + 4 >= dst + sz) {
149 					tsz = sz + 128;
150 					if (tsz > maxsz)
151 						tsz = maxsz;
152 					tp = realloc(dst, tsz);
153 					if (tp == NULL) {
154 						ret = -1;
155 						break;
156 					}
157 					dp = tp + (dp - dst);
158 					dst = tp;
159 					sz = tsz;
160 				}
161 				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
162 				width = tp - dp;
163 				total_width += width;
164 				dp = tp;
165 			} else
166 				width = 4;
167 			len--;
168 			sp++;
169 			if (ret >= 0)
170 				ret += width;
171 		}
172 		if (len > 0)
173 			break;
174 	}
175 	free(src);
176 	*dp = '\0';
177 	*str = dst;
178 	if (wp != NULL)
179 		*wp = total_width;
180 
181 	/*
182 	 * If the string was truncated by the width limit but
183 	 * would have fit into the size limit, the only sane way
184 	 * to report the problem is using the return value, such
185 	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
186 	 * works as expected.
187 	 */
188 
189 	if (ret < (int)maxsz && !print)
190 		ret = -1;
191 	return ret;
192 
193 fail:
194 	free(src);
195 	free(dst);
196 	*str = NULL;
197 	if (wp != NULL)
198 		*wp = 0;
199 	return -1;
200 }
201 
/*
 * Format into the caller-supplied buffer str of sz bytes, sanitizing
 * the result with vasnmprintf.  Unless wp is NULL, *wp limits the
 * number of display columns on input and receives the number of
 * columns used on output.  Returns the value of vasnmprintf.
 * If no string could be produced at all, str is set to the empty
 * string instead of being left uninitialized.
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp;
	int	 ret;

	cp = NULL;
	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* vasnmprintf failed before allocating anything. */
		*str = '\0';
	}
	return ret;
}
216 
217 /*
218  * To stay close to the standard interfaces, the following functions
219  * return the number of non-NUL bytes written.
220  */
221 
/*
 * Format according to fmt and ap, sanitize the result with
 * vasnmprintf (no column limit, so newline, carriage return, and
 * tab pass through), and write it to stream.
 * Returns the byte count reported by vasnmprintf, or -1 if
 * formatting or writing fails.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str;
	int	 ret;

	str = NULL;
	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/* A partial string may have been returned; don't leak it. */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
235 
/*
 * Variadic front end to vfmprintf: collect the arguments following
 * fmt and pass them on together with stream.
 * Returns whatever vfmprintf returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stream, fmt, args);
	va_end(args);

	return nbytes;
}
247 
/*
 * Variadic front end to vfmprintf that always writes to stdout.
 * Returns whatever vfmprintf returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nbytes;

	va_start(args, fmt);
	nbytes = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nbytes;
}
259