1 /** @file
2 
3   Entry point to the traffic manager.
4 
5   @section license License
6 
7   Licensed to the Apache Software Foundation (ASF) under one
8   or more contributor license agreements.  See the NOTICE file
9   distributed with this work for additional information
10   regarding copyright ownership.  The ASF licenses this file
11   to you under the Apache License, Version 2.0 (the
12   "License"); you may not use this file except in compliance
13   with the License.  You may obtain a copy of the License at
14 
15       http://www.apache.org/licenses/LICENSE-2.0
16 
17   Unless required by applicable law or agreed to in writing, software
18   distributed under the License is distributed on an "AS IS" BASIS,
19   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20   See the License for the specific language governing permissions and
21   limitations under the License.
22  */
23 
24 #include "tscore/ink_sys_control.h"
25 #include "tscore/ink_cap.h"
26 #include "tscore/ink_lockfile.h"
27 #include "tscore/ink_sock.h"
28 #include "tscore/ink_args.h"
29 #include "tscore/ink_syslog.h"
30 #include "tscore/runroot.h"
31 
32 #include "WebMgmtUtils.h"
33 #include "MgmtUtils.h"
34 #include "MgmtSocket.h"
35 #include "NetworkUtilsRemote.h"
36 #include "FileManager.h"
37 #include "tscore/I_Layout.h"
38 #include "tscore/I_Version.h"
39 #include "tscore/TextBuffer.h"
40 #include "DiagsConfig.h"
41 #include "HTTP.h"
42 #include "CoreAPI.h"
43 
44 #include "LocalManager.h"
45 #include "TSControlMain.h"
46 #include "EventControlMain.h"
47 
48 // Needs LibRecordsConfigInit()
49 #include "RecordsConfig.h"
50 
51 #include "records/P_RecLocal.h"
52 #include "DerivativeMetrics.h"
53 
54 #if TS_USE_POSIX_CAP
55 #include <sys/capability.h>
56 #endif
57 #include <grp.h>
58 #include <atomic>
59 #include "tscore/bwf_std_format.h"
60 
61 #define FD_THROTTLE_HEADROOM (128 + 64) // TODO: consolidate with THROTTLE_FD_HEADROOM
62 #define DIAGS_LOG_FILENAME "manager.log"
63 
64 #if ATOMIC_INT_LOCK_FREE != 2
65 #error "Need lock free std::atomic<int>"
66 #endif
67 
68 using namespace std::literals;
69 
70 // These globals are still referenced directly by management API.
71 LocalManager *lmgmt = nullptr;
72 FileManager *configFiles;
73 
74 static void fileUpdated(char *fname, char *configName);
75 static void runAsUser(const char *userName);
76 
77 #if defined(freebsd)
78 extern "C" int getpwnam_r(const char *name, struct passwd *result, char *buffer, size_t buflen, struct passwd **resptr);
79 #endif
80 
81 static AppVersionInfo appVersionInfo; // Build info for this application
82 
83 static inkcoreapi DiagsConfig *diagsConfig;
84 static char debug_tags[1024]  = "";
85 static char action_tags[1024] = "";
86 static int proxy_off          = false;
87 static int listen_off         = false;
88 static char bind_stdout[512]  = "";
89 static char bind_stderr[512]  = "";
90 static const char *mgmt_path  = nullptr;
91 
92 // By default, set the current directory as base
93 static const char *recs_conf = "records.config";
94 
95 static int fds_limit;
96 
97 // TODO: Use positive instead negative selection
98 //       This should just be #if defined(solaris)
99 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
100 static void SignalHandler(int sig, siginfo_t *t, void *f);
101 static void SignalAlrmHandler(int sig, siginfo_t *t, void *f);
102 #else
103 static void SignalHandler(int sig);
104 static void SignalAlrmHandler(int sig);
105 #endif
106 
107 static std::atomic<int> sigHupNotifier;
108 static void SigChldHandler(int sig);
109 
110 static void
111 rotateLogs()
112 {
113   // First, let us synchronously update the rolling config values for both diagslog
114   // and outputlog. Note that the config values for outputlog in traffic_server
115   // are never updated past the original instantiation of Diags. This shouldn't
116   // be an issue since we're never rolling outputlog from traffic_server anyways.
117   // The reason being is that it is difficult to send a notification from TS to
118   // TM, informing TM that outputlog has been rolled. It is much easier sending
119   // a notification (in the form of SIGUSR2) from TM -> TS.
120   int output_log_roll_int    = (int)REC_ConfigReadInteger("proxy.config.output.logfile.rolling_interval_sec");
121   int output_log_roll_size   = (int)REC_ConfigReadInteger("proxy.config.output.logfile.rolling_size_mb");
122   int output_log_roll_enable = (int)REC_ConfigReadInteger("proxy.config.output.logfile.rolling_enabled");
123   int diags_log_roll_int     = (int)REC_ConfigReadInteger("proxy.config.diags.logfile.rolling_interval_sec");
124   int diags_log_roll_size    = (int)REC_ConfigReadInteger("proxy.config.diags.logfile.rolling_size_mb");
125   int diags_log_roll_enable  = (int)REC_ConfigReadInteger("proxy.config.diags.logfile.rolling_enabled");
126   diags->config_roll_diagslog((RollingEnabledValues)diags_log_roll_enable, diags_log_roll_int, diags_log_roll_size);
127   diags->config_roll_outputlog((RollingEnabledValues)output_log_roll_enable, output_log_roll_int, output_log_roll_size);
128 
129   // Now we can actually roll the logs (if necessary)
130   if (diags->should_roll_diagslog()) {
131     mgmt_log("Rotated %s", DIAGS_LOG_FILENAME);
132   }
133 
134   if (diags->should_roll_outputlog()) {
135     // send a signal to TS to reload traffic.out, so the logfile is kept
136     // synced across processes
137     mgmt_log("Sending SIGUSR2 to TS");
138     pid_t tspid = lmgmt->watched_process_pid;
139     if (tspid <= 0) {
140       return;
141     }
142     if (kill(tspid, SIGUSR2) != 0) {
143       mgmt_log("Could not send SIGUSR2 to TS: %s", strerror(errno));
144     } else {
145       mgmt_log("Successfully sent SIGUSR2 to TS!");
146     }
147   }
148 }
149 
150 static bool
151 is_server_idle()
152 {
153   RecInt active    = 0;
154   RecInt threshold = 0;
155 
156   if (RecGetRecordInt("proxy.config.restart.active_client_threshold", &threshold) != REC_ERR_OKAY) {
157     return false;
158   }
159 
160   if (RecGetRecordInt("proxy.process.http.current_active_client_connections", &active) != REC_ERR_OKAY) {
161     return false;
162   }
163 
164   Debug("lm", "%" PRId64 " active clients, threshold is %" PRId64, active, threshold);
165   return active <= threshold;
166 }
167 
168 static bool
169 is_server_idle_from_new_connection()
170 {
171   RecInt active    = 0;
172   RecInt threshold = 0;
173   // TODO implement with the right metric
174 
175   Debug("lm", "%" PRId64 " active clients, threshold is %" PRId64, active, threshold);
176 
177   return active <= threshold;
178 }
179 
180 static bool
181 is_server_draining()
182 {
183   RecInt draining = 0;
184   if (RecGetRecordInt("proxy.node.config.draining", &draining) != REC_ERR_OKAY) {
185     return false;
186   }
187   return draining != 0;
188 }
189 
190 static bool
191 waited_enough()
192 {
193   RecInt timeout = 0;
194   if (RecGetRecordInt("proxy.config.stop.shutdown_timeout", &timeout) != REC_ERR_OKAY) {
195     return false;
196   }
197 
198   return (lmgmt->mgmt_shutdown_triggered_at + timeout >= time(nullptr));
199 }
200 
201 static void
202 check_lockfile()
203 {
204   std::string rundir(RecConfigReadRuntimeDir());
205   char lockfile[PATH_NAME_MAX];
206   int err;
207   pid_t holding_pid;
208 
209   //////////////////////////////////////
210   // test for presence of server lock //
211   //////////////////////////////////////
212   Layout::relative_to(lockfile, sizeof(lockfile), rundir, SERVER_LOCK);
213   Lockfile server_lockfile(lockfile);
214   err = server_lockfile.Open(&holding_pid);
215   if (err == 1) {
216     server_lockfile.Close(); // no server running
217   } else {
218     char *reason = strerror(-err);
219     if (err == 0) {
220       fprintf(stderr, "FATAL: Lockfile '%s' says server already running as PID %ld\n", lockfile, static_cast<long>(holding_pid));
221       mgmt_log("FATAL: Lockfile '%s' says server already running as PID %d\n", lockfile, holding_pid);
222     } else {
223       fprintf(stderr, "FATAL: Can't open server lockfile '%s' (%s)\n", lockfile, (reason ? reason : "Unknown Reason"));
224       mgmt_log("FATAL: Can't open server lockfile '%s' (%s)\n", lockfile, (reason ? reason : "Unknown Reason"));
225     }
226     exit(1);
227   }
228 
229   ///////////////////////////////////////////
230   // try to get the exclusive manager lock //
231   ///////////////////////////////////////////
232   Layout::relative_to(lockfile, sizeof(lockfile), rundir, MANAGER_LOCK);
233   Lockfile manager_lockfile(lockfile);
234   err = manager_lockfile.Get(&holding_pid);
235   if (err != 1) {
236     char *reason = strerror(-err);
237     fprintf(stderr, "FATAL: Can't acquire manager lockfile '%s'", lockfile);
238     mgmt_log("FATAL: Can't acquire manager lockfile '%s'", lockfile);
239     if (err == 0) {
240       fprintf(stderr, " (Lock file held by process ID %ld)\n", static_cast<long>(holding_pid));
241       mgmt_log(" (Lock file held by process ID %d)\n", holding_pid);
242     } else if (reason) {
243       fprintf(stderr, " (%s)\n", reason);
244       mgmt_log(" (%s)\n", reason);
245     } else {
246       fprintf(stderr, "\n");
247     }
248     exit(1);
249 
250     fprintf(stderr, "unable to acquire manager lock [%d]\n", -err);
251     exit(1);
252   }
253 }
254 
255 static void
256 initSignalHandlers()
257 {
258   struct sigaction sigHandler, sigChldHandler, sigAlrmHandler;
259   sigset_t sigsToBlock;
260 
261 // Set up the signal handler
262 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
263   sigHandler.sa_handler   = nullptr;
264   sigHandler.sa_sigaction = SignalHandler;
265 #else
266   sigHandler.sa_handler     = SignalHandler;
267 #endif
268   sigemptyset(&sigHandler.sa_mask);
269 
270   // We want the handler to remain in place on
271   //  SIGHUP to avoid any races with the signals
272   //  coming too quickly.  Also restart systems calls
273   //  after the signal since not all calls are wrapped
274   //  to check errno for EINTR
275   sigHandler.sa_flags = SA_RESTART;
276   sigaction(SIGHUP, &sigHandler, nullptr);
277   sigaction(SIGUSR2, &sigHandler, nullptr);
278 
279 // Don't block the signal on entry to the signal
280 //   handler so we can reissue it and get a core
281 //   file in the appropriate circumstances
282 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
283   sigHandler.sa_flags = SA_RESETHAND | SA_SIGINFO;
284 #else
285   sigHandler.sa_flags       = SA_RESETHAND;
286 #endif
287   sigaction(SIGINT, &sigHandler, nullptr);
288   sigaction(SIGQUIT, &sigHandler, nullptr);
289   sigaction(SIGILL, &sigHandler, nullptr);
290   sigaction(SIGBUS, &sigHandler, nullptr);
291   sigaction(SIGSEGV, &sigHandler, nullptr);
292   sigaction(SIGTERM, &sigHandler, nullptr);
293 
294 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
295   sigAlrmHandler.sa_handler   = nullptr;
296   sigAlrmHandler.sa_sigaction = SignalAlrmHandler;
297 #else
298   sigAlrmHandler.sa_handler = SignalAlrmHandler;
299 #endif
300 
301   sigemptyset(&sigAlrmHandler.sa_mask);
302 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
303   sigAlrmHandler.sa_flags = SA_SIGINFO;
304 #else
305   sigAlrmHandler.sa_flags   = 0;
306 #endif
307   sigaction(SIGALRM, &sigAlrmHandler, nullptr);
308 
309   // Block the delivery of any signals we are not catching
310   //
311   //  except for SIGALRM since we use it
312   //    to break out of deadlock on semaphore
313   //    we share with the proxy
314   //
315   sigfillset(&sigsToBlock);
316   sigdelset(&sigsToBlock, SIGHUP);
317   sigdelset(&sigsToBlock, SIGUSR2);
318   sigdelset(&sigsToBlock, SIGINT);
319   sigdelset(&sigsToBlock, SIGQUIT);
320   sigdelset(&sigsToBlock, SIGILL);
321   sigdelset(&sigsToBlock, SIGABRT);
322   sigdelset(&sigsToBlock, SIGBUS);
323   sigdelset(&sigsToBlock, SIGSEGV);
324   sigdelset(&sigsToBlock, SIGTERM);
325   sigdelset(&sigsToBlock, SIGALRM);
326   ink_thread_sigsetmask(SIG_SETMASK, &sigsToBlock, nullptr);
327 
328   // Set up the SIGCHLD handler so we do not get into
329   //   a problem with Solaris 2.6 and strange waitpid()
330   //   behavior
331   sigChldHandler.sa_handler = SigChldHandler;
332   sigChldHandler.sa_flags   = SA_RESTART;
333   sigemptyset(&sigChldHandler.sa_mask);
334   sigaction(SIGCHLD, &sigChldHandler, nullptr);
335 }
336 
337 static void
338 init_dirs()
339 {
340   std::string rundir(RecConfigReadRuntimeDir());
341   std::string sysconfdir(RecConfigReadConfigDir());
342 
343   if (access(sysconfdir.c_str(), R_OK) == -1) {
344     mgmt_elog(0, "unable to access() config directory '%s': %d, %s\n", sysconfdir.c_str(), errno, strerror(errno));
345     mgmt_elog(0, "please set the 'TS_ROOT' environment variable\n");
346     ::exit(1);
347   }
348 
349   if (access(rundir.c_str(), R_OK) == -1) {
350     mgmt_elog(0, "unable to access() local state directory '%s': %d, %s\n", rundir.c_str(), errno, strerror(errno));
351     mgmt_elog(0, "please set 'proxy.config.local_state_dir'\n");
352     ::exit(1);
353   }
354 }
355 
356 static void
357 chdir_root()
358 {
359   std::string prefix = Layout::get()->prefix;
360 
361   if (chdir(prefix.c_str()) < 0) {
362     mgmt_elog(0, "unable to change to root directory \"%s\" [%d '%s']\n", prefix.c_str(), errno, strerror(errno));
363     mgmt_elog(0, " please set correct path in env variable TS_ROOT \n");
364     exit(1);
365   } else {
366     mgmt_log("[TrafficManager] using root directory '%s'\n", prefix.c_str());
367   }
368 }
369 
370 static void
371 set_process_limits(RecInt fds_throttle)
372 {
373   struct rlimit lim;
374   rlim_t maxfiles;
375 
376   // Set needed rlimits (root)
377   ink_max_out_rlimit(RLIMIT_NOFILE);
378   ink_max_out_rlimit(RLIMIT_STACK);
379   ink_max_out_rlimit(RLIMIT_DATA);
380   ink_max_out_rlimit(RLIMIT_FSIZE);
381 #ifdef RLIMIT_RSS
382   ink_max_out_rlimit(RLIMIT_RSS);
383 #endif
384 
385   maxfiles = ink_get_max_files();
386   if (maxfiles != RLIM_INFINITY) {
387     float file_max_pct = 0.9;
388 
389     REC_ReadConfigFloat(file_max_pct, "proxy.config.system.file_max_pct");
390     if (file_max_pct > 1.0) {
391       file_max_pct = 1.0;
392     }
393 
394     lim.rlim_cur = lim.rlim_max = static_cast<rlim_t>(maxfiles * file_max_pct);
395     if (setrlimit(RLIMIT_NOFILE, &lim) == 0 && getrlimit(RLIMIT_NOFILE, &lim) == 0) {
396       fds_limit = static_cast<int>(lim.rlim_cur);
397       syslog(LOG_NOTICE, "NOTE: RLIMIT_NOFILE(%d):cur(%d),max(%d)", RLIMIT_NOFILE, static_cast<int>(lim.rlim_cur),
398              static_cast<int>(lim.rlim_max));
399     }
400   }
401 
402   if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
403     if (fds_throttle > (int)(lim.rlim_cur + FD_THROTTLE_HEADROOM)) {
404       lim.rlim_cur = (lim.rlim_max = (rlim_t)fds_throttle);
405       if (!setrlimit(RLIMIT_NOFILE, &lim) && !getrlimit(RLIMIT_NOFILE, &lim)) {
406         fds_limit = static_cast<int>(lim.rlim_cur);
407         syslog(LOG_NOTICE, "NOTE: RLIMIT_NOFILE(%d):cur(%d),max(%d)", RLIMIT_NOFILE, static_cast<int>(lim.rlim_cur),
408                static_cast<int>(lim.rlim_max));
409       }
410     }
411   }
412 }
413 
414 #if TS_HAS_WCCP
415 static void
416 Errata_Logger(ts::Errata const &err)
417 {
418   size_t n;
419   static size_t const SIZE = 4096;
420   char buff[SIZE];
421   if (err.size()) {
422     ts::Errata::Code code = err.top().getCode();
423     n                     = err.write(buff, SIZE, 1, 0, 2, "> ");
424     // strip trailing newlines.
425     while (n && (buff[n - 1] == '\n' || buff[n - 1] == '\r'))
426       buff[--n] = 0;
427     // log it.
428     if (code > 1)
429       mgmt_elog(0, "[WCCP]%s", buff);
430     else if (code > 0)
431       mgmt_log("[WCCP]%s", buff);
432     else
433       Debug("WCCP", "%s", buff);
434   }
435 }
436 
437 static void
438 Init_Errata_Logging()
439 {
440   ts::Errata::registerSink(&Errata_Logger);
441 }
442 #endif
443 
// Sleep for the given number of milliseconds using nanosleep().
// The remaining-time output of nanosleep() is not requested and its return
// value is ignored.
static void
millisleep(int ms)
{
  struct timespec delay;

  // Whole seconds, then the leftover milliseconds expressed in nanoseconds.
  delay.tv_sec  = ms / 1000;
  delay.tv_nsec = (ms % 1000) * 1000L * 1000L;

  // we use nanosleep instead of sleep because it does not interact with signals
  nanosleep(&delay, nullptr);
}
453 
454 bool
455 api_socket_is_restricted()
456 {
457   RecInt intval;
458 
459   // If the socket is not administratively restricted, check whether we have platform
460   // support. Otherwise, default to making it restricted.
461   if (RecGetRecordInt("proxy.config.admin.api.restricted", &intval) == REC_ERR_OKAY) {
462     if (intval == 0) {
463       return !mgmt_has_peereid();
464     }
465   }
466 
467   return true;
468 }
469 
470 int
471 main(int argc, const char **argv)
472 {
473   const long MAX_LOGIN = ink_login_name_max();
474 
475   runroot_handler(argv);
476 
477   // Before accessing file system initialize Layout engine
478   Layout::create();
479   mgmt_path = Layout::get()->sysconfdir.c_str();
480 
481   // Set up the application version info
482   appVersionInfo.setup(PACKAGE_NAME, "traffic_manager", PACKAGE_VERSION, __DATE__, __TIME__, BUILD_MACHINE, BUILD_PERSON, "");
483 
484   bool found       = false;
485   int just_started = 0;
486   // TODO: This seems completely incomplete, disabled for now
487   //  int dump_config = 0, dump_process = 0, dump_node = 0, dump_local = 0;
488   char *proxy_port   = nullptr;
489   char *tsArgs       = nullptr;
490   int disable_syslog = false;
491   char userToRunAs[MAX_LOGIN + 1];
492   RecInt fds_throttle = -1;
493 
494   ArgumentDescription argument_descriptions[] = {
495     {"proxyOff", '-', "Disable proxy", "F", &proxy_off, nullptr, nullptr},
496     {"listenOff", '-', "Disable traffic manager listen to proxy ports", "F", &listen_off, nullptr, nullptr},
497     {"path", '-', "Path to the management socket", "S*", &mgmt_path, nullptr, nullptr},
498     {"recordsConf", '-', "Path to records.config", "S*", &recs_conf, nullptr, nullptr},
499     {"tsArgs", '-', "Additional arguments for traffic_server", "S*", &tsArgs, nullptr, nullptr},
500     {"proxyPort", '-', "HTTP port descriptor", "S*", &proxy_port, nullptr, nullptr},
501     {"maxRecords", 'm', "Max number of librecords metrics and configurations (default & minimum: 1600)", "I", &max_records_entries,
502      "PROXY_MAX_RECORDS", nullptr},
503     {TM_OPT_BIND_STDOUT, '-', "Regular file to bind stdout to", "S512", &bind_stdout, "PROXY_BIND_STDOUT", nullptr},
504     {TM_OPT_BIND_STDERR, '-', "Regular file to bind stderr to", "S512", &bind_stderr, "PROXY_BIND_STDERR", nullptr},
505 #if TS_USE_DIAGS
506     {"debug", 'T', "Vertical-bar-separated Debug Tags", "S1023", debug_tags, nullptr, nullptr},
507     {"action", 'B', "Vertical-bar-separated Behavior Tags", "S1023", action_tags, nullptr, nullptr},
508 #endif
509     {"nosyslog", '-', "Do not log to syslog", "F", &disable_syslog, nullptr, nullptr},
510     HELP_ARGUMENT_DESCRIPTION(),
511     VERSION_ARGUMENT_DESCRIPTION(),
512     RUNROOT_ARGUMENT_DESCRIPTION()
513   };
514 
515   // Process command line arguments and dump into variables
516   process_args(&appVersionInfo, argument_descriptions, countof(argument_descriptions), argv);
517 
518   // change the directory to the "root" directory
519   chdir_root();
520 
521   // Line buffer standard output & standard error
522   int status;
523   status = setvbuf(stdout, nullptr, _IOLBF, 0);
524   if (status != 0) {
525     perror("WARNING: can't line buffer stdout");
526   }
527   status = setvbuf(stderr, nullptr, _IOLBF, 0);
528   if (status != 0) {
529     perror("WARNING: can't line buffer stderr");
530   }
531 
532   initSignalHandlers();
533 
534   // Bootstrap with LOG_DAEMON until we've read our configuration
535   if (!disable_syslog) {
536     openlog("traffic_manager", LOG_PID | LOG_NDELAY | LOG_NOWAIT, LOG_DAEMON);
537     mgmt_use_syslog();
538     syslog(LOG_NOTICE, "NOTE: --- Manager Starting ---");
539     syslog(LOG_NOTICE, "NOTE: Manager Version: %s", appVersionInfo.FullVersionInfoStr);
540   }
541 
542   // Bootstrap the Diags facility so that we can use it while starting
543   //  up the manager
544   diagsConfig = new DiagsConfig("Manager", DIAGS_LOG_FILENAME, debug_tags, action_tags, false);
545   diags       = diagsConfig->diags;
546   diags->set_std_output(StdStream::STDOUT, bind_stdout);
547   diags->set_std_output(StdStream::STDERR, bind_stderr);
548 
549   RecLocalInit();
550   LibRecordsConfigInit();
551 
552   init_dirs(); // setup critical directories, needs LibRecords
553 
554   if (RecGetRecordString("proxy.config.admin.user_id", userToRunAs, sizeof(userToRunAs)) != REC_ERR_OKAY ||
555       strlen(userToRunAs) == 0) {
556     mgmt_fatal(0, "proxy.config.admin.user_id is not set\n");
557   }
558 
559   RecGetRecordInt("proxy.config.net.connections_throttle", &fds_throttle);
560 
561   set_process_limits(fds_throttle); // as root
562 
563   // A user of #-1 means to not attempt to switch user. Yes, it's documented ;)
564   if (strcmp(userToRunAs, "#-1") != 0) {
565     runAsUser(userToRunAs);
566   }
567 
568   EnableCoreFile(true);
569   check_lockfile();
570 
571   url_init();
572   mime_init();
573   http_init();
574 
575 #if TS_HAS_WCCP
576   Init_Errata_Logging();
577 #endif
578   ts_host_res_global_init();
579   ts_session_protocol_well_known_name_indices_init();
580   lmgmt = new LocalManager(proxy_off == false, listen_off == false);
581   RecLocalInitMessage();
582   lmgmt->initAlarm();
583 
584   if (diags) {
585     delete diagsConfig;
586   }
587   // INKqa11968: need to set up callbacks and diags data structures
588   // using configuration in records.config
589   diagsConfig = new DiagsConfig("Manager", DIAGS_LOG_FILENAME, debug_tags, action_tags, true);
590   diags       = diagsConfig->diags;
591   RecSetDiags(diags);
592   diags->set_std_output(StdStream::STDOUT, bind_stdout);
593   diags->set_std_output(StdStream::STDERR, bind_stderr);
594 
595   if (is_debug_tag_set("diags")) {
596     diags->dump();
597   }
598   diags->cleanup_func = mgmt_cleanup;
599 
600   // Setup the exported manager version records.
601   RecSetRecordString("proxy.node.version.manager.short", appVersionInfo.VersionStr, REC_SOURCE_DEFAULT);
602   RecSetRecordString("proxy.node.version.manager.long", appVersionInfo.FullVersionInfoStr, REC_SOURCE_DEFAULT);
603   RecSetRecordString("proxy.node.version.manager.build_number", appVersionInfo.BldNumStr, REC_SOURCE_DEFAULT);
604   RecSetRecordString("proxy.node.version.manager.build_time", appVersionInfo.BldTimeStr, REC_SOURCE_DEFAULT);
605   RecSetRecordString("proxy.node.version.manager.build_date", appVersionInfo.BldDateStr, REC_SOURCE_DEFAULT);
606   RecSetRecordString("proxy.node.version.manager.build_machine", appVersionInfo.BldMachineStr, REC_SOURCE_DEFAULT);
607   RecSetRecordString("proxy.node.version.manager.build_person", appVersionInfo.BldPersonStr, REC_SOURCE_DEFAULT);
608 
609   if (!disable_syslog) {
610     char sys_var[]     = "proxy.config.syslog_facility";
611     char *facility_str = nullptr;
612     int facility_int;
613 
614     facility_str = REC_readString(sys_var, &found);
615     ink_assert(found);
616 
617     if (!found) {
618       mgmt_elog(0, "Could not read %s.  Defaulting to LOG_DAEMON\n", sys_var);
619       facility_int = LOG_DAEMON;
620     } else {
621       facility_int = facility_string_to_int(facility_str);
622       ats_free(facility_str);
623       if (facility_int < 0) {
624         mgmt_elog(0, "Bad syslog facility specified.  Defaulting to LOG_DAEMON\n");
625         facility_int = LOG_DAEMON;
626       }
627     }
628 
629     // NOTE: do NOT call closelog() here.  Solaris gets confused.
630     openlog("traffic_manager", LOG_PID | LOG_NDELAY | LOG_NOWAIT, facility_int);
631 
632     lmgmt->syslog_facility = facility_int;
633   } else {
634     lmgmt->syslog_facility = -1;
635   }
636 
637   // Find out our hostname so we can use it as part of the initialization
638   setHostnameVar();
639 
640   // Initialize the Config Object bindings before
641   //   starting any other threads
642   lmgmt->configFiles = configFiles = new FileManager();
643   initializeRegistry();
644   configFiles->registerCallback(fileUpdated);
645 
646   // RecLocal's 'sync_thr' depends on 'configFiles', so we can't
647   // stat the 'sync_thr' until 'configFiles' has been initialized.
648   RecLocalStart(configFiles);
649 
650   // TS needs to be started up with the same outputlog bindings each time,
651   // so we append the outputlog location to the persistent proxy options
652   //
653   // TS needs them to be able to create BaseLogFiles for each value
654   ts::bwprint(lmgmt->proxy_options, "{}{}{}", ts::bwf::OptionalAffix(tsArgs),
655               ts::bwf::OptionalAffix(bind_stdout, " "sv, "--bind_stdout "sv),
656               ts::bwf::OptionalAffix(bind_stderr, " "sv, "--bind_stderr "sv));
657 
658   if (proxy_port) {
659     HttpProxyPort::loadValue(lmgmt->m_proxy_ports, proxy_port);
660   }
661 
662   lmgmt->initMgmtProcessServer(); /* Setup p-to-p process server */
663 
664   lmgmt->listenForProxy();
665 
666   // Setup the API and event sockets
667   std::string rundir(RecConfigReadRuntimeDir());
668   std::string apisock(Layout::relative_to(rundir, MGMTAPI_MGMT_SOCKET_NAME));
669   std::string eventsock(Layout::relative_to(rundir, MGMTAPI_EVENT_SOCKET_NAME));
670 
671   Debug("lm", "using main socket file '%s'", apisock.c_str());
672   Debug("lm", "using event socket file '%s'", eventsock.c_str());
673 
674   mode_t oldmask = umask(0);
675   mode_t newmode = api_socket_is_restricted() ? 00700 : 00777;
676 
677   int mgmtapiFD  = -1; // FD for the api interface to issue commands
678   int eventapiFD = -1; // FD for the api and clients to handle event callbacks
679 
680   mgmtapiFD = bind_unix_domain_socket(apisock.c_str(), newmode);
681   if (mgmtapiFD == -1) {
682     mgmt_log("[WebIntrMain] Unable to set up socket for handling management API calls. API socket path = %s\n", apisock.c_str());
683   }
684 
685   eventapiFD = bind_unix_domain_socket(eventsock.c_str(), newmode);
686   if (eventapiFD == -1) {
687     mgmt_log("[WebIntrMain] Unable to set up so for handling management API event calls. Event Socket path: %s\n",
688              eventsock.c_str());
689   }
690 
691   umask(oldmask);
692   ink_thread_create(nullptr, ts_ctrl_main, &mgmtapiFD, 0, 0, nullptr);
693   ink_thread_create(nullptr, event_callback_main, &eventapiFD, 0, 0, nullptr);
694 
695   mgmt_log("[TrafficManager] Setup complete\n");
696 
697   RecRegisterStatInt(RECT_NODE, "proxy.node.config.reconfigure_time", time(nullptr), RECP_NON_PERSISTENT);
698   RecRegisterStatInt(RECT_NODE, "proxy.node.config.reconfigure_required", 0, RECP_NON_PERSISTENT);
699 
700   RecRegisterStatInt(RECT_NODE, "proxy.node.config.restart_required.proxy", 0, RECP_NON_PERSISTENT);
701   RecRegisterStatInt(RECT_NODE, "proxy.node.config.restart_required.manager", 0, RECP_NON_PERSISTENT);
702 
703   RecRegisterStatInt(RECT_NODE, "proxy.node.config.draining", 0, RECP_NON_PERSISTENT);
704 
705   const int MAX_SLEEP_S      = 60; // Max sleep duration
706   int sleep_time             = 0;  // sleep_time given in sec
707   uint64_t last_start_epoc_s = 0;  // latest start attempt in seconds since epoc
708 
709   DerivativeMetrics derived; // This is simple class to calculate some useful derived metrics
710 
711   for (;;) {
712     lmgmt->processEventQueue();
713     lmgmt->pollMgmtProcessServer();
714 
715     // Handle rotation of output log (aka traffic.out) as well as DIAGS_LOG_FILENAME (aka manager.log)
716     rotateLogs();
717 
718     // Check for a SIGHUP
719     if (sigHupNotifier) {
720       mgmt_log("[main] Reading Configuration Files due to SIGHUP\n");
721       Reconfigure();
722       sigHupNotifier = 0;
723       mgmt_log("[main] Reading Configuration Files Reread\n");
724     }
725 
726     // Update the derived metrics. ToDo: this runs once a second, that might be excessive, maybe it should be
727     // done more like every config_update_interval_ms (proxy.config.config_update_interval_ms) ?
728     derived.Update();
729 
730     if (lmgmt->mgmt_shutdown_outstanding != MGMT_PENDING_NONE) {
731       Debug("lm", "pending shutdown %d", lmgmt->mgmt_shutdown_outstanding);
732     }
733     switch (lmgmt->mgmt_shutdown_outstanding) {
734     case MGMT_PENDING_RESTART:
735       lmgmt->mgmtShutdown();
736       ::exit(0);
737       break;
738     case MGMT_PENDING_IDLE_RESTART:
739       if (!is_server_draining()) {
740         lmgmt->processDrain();
741       }
742       if (is_server_idle() || waited_enough()) {
743         lmgmt->mgmtShutdown();
744         ::exit(0);
745       }
746       break;
747     case MGMT_PENDING_BOUNCE:
748       lmgmt->processBounce();
749       lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
750       break;
751     case MGMT_PENDING_IDLE_BOUNCE:
752       if (!is_server_draining()) {
753         lmgmt->processDrain();
754       }
755       if (is_server_idle() || waited_enough()) {
756         lmgmt->processBounce();
757         lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
758       }
759       break;
760     case MGMT_PENDING_STOP:
761       lmgmt->processShutdown();
762       lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
763       break;
764     case MGMT_PENDING_IDLE_STOP:
765       if (!is_server_draining()) {
766         lmgmt->processDrain();
767       }
768       if (is_server_idle() || waited_enough()) {
769         lmgmt->processShutdown();
770         lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
771       }
772       break;
773     case MGMT_PENDING_DRAIN:
774       if (!is_server_draining()) {
775         lmgmt->processDrain();
776       }
777       lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
778       break;
779     case MGMT_PENDING_IDLE_DRAIN:
780       if (is_server_idle_from_new_connection()) {
781         lmgmt->processDrain();
782         lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
783       }
784       break;
785     case MGMT_PENDING_UNDO_DRAIN:
786       if (is_server_draining()) {
787         lmgmt->processDrain(0);
788         lmgmt->mgmt_shutdown_outstanding = MGMT_PENDING_NONE;
789       }
790       break;
791     default:
792       break;
793     }
794 
795     if (lmgmt->run_proxy && !lmgmt->processRunning() && lmgmt->proxy_recoverable) { /* Make sure we still have a proxy up */
796       const uint64_t now = static_cast<uint64_t>(time(nullptr));
797       if (sleep_time && ((now - last_start_epoc_s) < MAX_SLEEP_S)) {
798         mgmt_log("Relaunching proxy after %d sec...", sleep_time);
799         millisleep(1000 * sleep_time); // we use millisleep instead of sleep because it doesnt interfere with signals
800         sleep_time = std::min(sleep_time * 2, MAX_SLEEP_S);
801       } else {
802         sleep_time = 1;
803       }
804       if (ProxyStateSet(TS_PROXY_ON, TS_CACHE_CLEAR_NONE) == TS_ERR_OKAY) {
805         just_started      = 0;
806         last_start_epoc_s = static_cast<uint64_t>(time(nullptr));
807       } else {
808         just_started++;
809       }
810     } else { /* Give the proxy a chance to fire up */
811       if (!lmgmt->proxy_recoverable) {
812         mgmt_log("[main] Proxy is un-recoverable. Proxy will not be relaunched.\n");
813       }
814 
815       just_started++;
816     }
817 
818     /* This will catch the case were the proxy dies before it can connect to manager */
819     if (lmgmt->proxy_launch_outstanding && !lmgmt->processRunning() && just_started >= 120) {
820       just_started                    = 0;
821       lmgmt->proxy_launch_outstanding = false;
822       if (lmgmt->proxy_launch_pid != -1) {
823         int res;
824         kill(lmgmt->proxy_launch_pid, 9);
825         waitpid(lmgmt->proxy_launch_pid, &res, 0);
826         if (WIFSIGNALED(res)) {
827           int sig = WTERMSIG(res);
828 #ifdef NEED_PSIGNAL
829           mgmt_log("[main] Proxy terminated due to Sig %d. Relaunching after %d sec...\n", sig, sleep_time);
830 #else
831           mgmt_log("[main] Proxy terminated due to Sig %d: %s. Relaunching after %d sec...\n", sig, strsignal(sig), sleep_time);
832 #endif /* NEED_PSIGNAL */
833         }
834       }
835       mgmt_log("[main] Proxy launch failed, retrying after %d sec...\n", sleep_time);
836     }
837   }
838 
839   // ToDo: Here we should delete anything related to calculated metrics.
840 
841 #ifndef MGMT_SERVICE
842   return 0;
843 #endif
844 
845 } /* End main */
846 
847 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
848 static void
849 SignalAlrmHandler(int /* sig ATS_UNUSED */, siginfo_t *t, void * /* c ATS_UNUSED */)
850 #else
851 static void
852 SignalAlrmHandler(int /* sig ATS_UNUSED */)
853 #endif
854 {
855 /*
856    fprintf("[TrafficManager] ==> SIGALRM received\n");
857    mgmt_elog(0, "[TrafficManager] ==> SIGALRM received\n");
858  */
859 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
860   if (t) {
861     if (t->si_code <= 0) {
862       fprintf(stderr, "[TrafficManager] ==> User Alarm from pid: %ld uid: %d\n", (long)t->si_pid, t->si_uid);
863       mgmt_log("[TrafficManager] ==> User Alarm from pid: %d uid: %d\n", t->si_pid, t->si_uid);
864     } else {
865       fprintf(stderr, "[TrafficManager] ==> Kernel Alarm Reason: %d\n", t->si_code);
866       mgmt_log("[TrafficManager] ==> Kernel Alarm Reason: %d\n", t->si_code);
867     }
868   }
869 #endif
870 
871   return;
872 }
873 
874 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
875 static void
876 SignalHandler(int sig, siginfo_t *t, void *c)
877 #else
878 static void
879 SignalHandler(int sig)
880 #endif
881 {
882   static int clean = 0;
883   int status;
884 
885 #if !defined(linux) && !defined(freebsd) && !defined(darwin)
886   if (t) {
887     if (t->si_code <= 0) {
888       fprintf(stderr, "[TrafficManager] ==> User Sig %d from pid: %ld uid: %d\n", sig, (long)t->si_pid, t->si_uid);
889       mgmt_log("[TrafficManager] ==> User Sig %d from pid: %ld uid: %d\n", sig, (long)t->si_pid, t->si_uid);
890     } else {
891       fprintf(stderr, "[TrafficManager] ==> Kernel Sig %d; Reason: %d\n", sig, t->si_code);
892       mgmt_log("[TrafficManager] ==> Kernel Sig %d; Reason: %d\n", sig, t->si_code);
893     }
894   }
895 #endif
896 
897   if (sig == SIGHUP) {
898     sigHupNotifier = 1;
899     return;
900   }
901 
902   fprintf(stderr, "[TrafficManager] ==> Cleaning up and reissuing signal #%d\n", sig);
903   mgmt_log("[TrafficManager] ==> Cleaning up and reissuing signal #%d\n", sig);
904 
905   if (lmgmt && !clean) {
906     clean = 1;
907     if (lmgmt->watched_process_pid != -1) {
908       if (sig == SIGTERM || sig == SIGINT) {
909         kill(lmgmt->watched_process_pid, sig);
910         waitpid(lmgmt->watched_process_pid, &status, 0);
911       }
912     }
913     lmgmt->mgmtCleanup();
914   }
915 
916   switch (sig) {
917   case SIGQUIT:
918   case SIGILL:
919   case SIGTRAP:
920 #if !defined(linux)
921   case SIGEMT:
922   case SIGSYS:
923 #endif
924   case SIGFPE:
925   case SIGBUS:
926   case SIGSEGV:
927   case SIGXCPU:
928   case SIGXFSZ:
929     abort();
930   default:
931     fprintf(stderr, "[TrafficManager] ==> signal #%d\n", sig);
932     mgmt_log("[TrafficManager] ==> signal #%d\n", sig);
933     ::exit(sig);
934   }
935   fprintf(stderr, "[TrafficManager] ==> signal2 #%d\n", sig);
936   mgmt_log("[TrafficManager] ==> signal2 #%d\n", sig);
937   ::exit(sig);
938 } /* End SignalHandler */
939 
// void SigChldHandler(int sig)
//
//   Deliberately empty SIGCHLD handler. It exists only so that the
//    signal is caught rather than ignored: with Solaris 2.6, ignoring
//    SIGCHLD changes waitpid() so that, when there are no unwaited
//    children, it blocks until every child has become a zombie,
//    which is bad for us.
//
static void
SigChldHandler(int /* sig ATS_UNUSED */)
{
  // Nothing to do; catching the signal is the whole point.
}
952 
953 void
954 fileUpdated(char *fname, char *configName)
955 {
956   // If there is no config name recorded, assume this file is not reloadable
957   // Just log a message
958   if (configName == nullptr || configName[0] == '\0') {
959     mgmt_log("[fileUpdated] %s changed, need restart", fname);
960   } else {
961     // Signal based on the config entry that has the changed file name
962     lmgmt->signalFileChange(configName);
963   }
964   return;
965 } /* End fileUpdate */
966 
#if TS_USE_POSIX_CAP
/** Restore capabilities after user id change.
    This manipulates LINUX capabilities so that this process
    can perform certain privileged operations even if it is
    no longer running as a privilege user.

    Each wanted capability is raised in the effective set and applied on
    its own; a capability that cannot be applied is cleared again so that
    it does not prevent the remaining ones from being applied. The
    resulting state of every wanted capability is then logged.

    @return The result of the final @c cap_set_proc() call, or -1 when the
    current capability state of the process could not be read.

    @internal
    I tried using
    @code
    prctl(PR_SET_KEEPCAPS, 1);
    @endcode
    but that had no effect even though the call reported success.
    Only explicit capability manipulation was effective.

    It does not appear to be necessary to set the capabilities on the
    executable if originally run as root. That may be needed if
    started as a user without that capability.
 */

int
restoreCapabilities()
{
  cap_t cap_set = cap_get_proc(); // current capabilities
  if (cap_set == nullptr) {
    // Without a capability state there is nothing to modify or to free.
    Warning("restore capabilities failed, unable to read the current process capabilities");
    return -1;
  }

  // Make a list of the capabilities we want turned on.
  cap_value_t cap_list[] = {
    CAP_NET_ADMIN,        ///< Set socket transparency.
    CAP_NET_BIND_SERVICE, ///< Low port (e.g. 80) binding.
    CAP_IPC_LOCK          ///< Lock IPC objects.
  };
  static int const CAP_COUNT = sizeof(cap_list) / sizeof(*cap_list);

  for (int i = 0; i < CAP_COUNT; i++) {
    if (cap_set_flag(cap_set, CAP_EFFECTIVE, 1, cap_list + i, CAP_SET) < 0) {
      Warning("restore CAP_EFFECTIVE failed for option %d", i);
    }
    if (cap_set_proc(cap_set) == -1) { // it failed, back out
      cap_set_flag(cap_set, CAP_EFFECTIVE, 1, cap_list + i, CAP_CLEAR);
    }
  }

  // Report the state each wanted capability ended up in.
  for (int i : cap_list) {
    cap_flag_value_t val;
    if (cap_get_flag(cap_set, i, CAP_EFFECTIVE, &val) >= 0) {
      Warning("CAP_EFFECTIVE offiset %d is %s", i, val == CAP_SET ? "set" : "unset");
    }
  }

  int zret = cap_set_proc(cap_set); // return value.
  cap_free(cap_set);
  return zret;
}
#endif
1019 
1020 //  void runAsUser(...)
1021 //
1022 //  If we are root, switched to user to run as
1023 //    specified in records.config
1024 //
1025 //  If we are not root, do nothing
1026 //
1027 void
1028 runAsUser(const char *userName)
1029 {
1030   if (getuid() == 0 || geteuid() == 0) {
1031     ImpersonateUser(userName, IMPERSONATE_EFFECTIVE);
1032 
1033 #if TS_USE_POSIX_CAP
1034     if (0 != restoreCapabilities()) {
1035       mgmt_log("[runAsUser] Error: Failed to restore capabilities after switch to user %s.\n", userName);
1036     }
1037 #endif
1038   }
1039 } /* End runAsUser() */
1040