syncthing/cmd/syncthing/monitor.go

597 lines
15 KiB
Go

// Copyright (C) 2014 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package main
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"
"github.com/syncthing/syncthing/lib/events"
"github.com/syncthing/syncthing/lib/fs"
"github.com/syncthing/syncthing/lib/locations"
"github.com/syncthing/syncthing/lib/osutil"
"github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/svcutil"
"github.com/syncthing/syncthing/lib/sync"
)
var (
stdoutFirstLines []string // The first 10 lines of stdout
stdoutLastLines []string // The last 50 lines of stdout
stdoutMut = sync.NewMutex()
)
const (
restartCounts = 4
restartPause = 1 * time.Second
restartLoopThreshold = 60 * time.Second
logFileAutoCloseDelay = 5 * time.Second
logFileMaxOpenTime = time.Minute
panicUploadMaxWait = 30 * time.Second
panicUploadNoticeWait = 10 * time.Second
)
func monitorMain(options serveOptions) {
l.SetPrefix("[monitor] ")
var dst io.Writer = os.Stdout
logFile := options.LogFile
if logFile != "-" {
if expanded, err := fs.ExpandTilde(logFile); err == nil {
logFile = expanded
}
var fileDst io.Writer
var err error
open := func(name string) (io.WriteCloser, error) {
return newAutoclosedFile(name, logFileAutoCloseDelay, logFileMaxOpenTime)
}
if options.LogMaxSize > 0 {
fileDst, err = newRotatedFile(logFile, open, int64(options.LogMaxSize), options.LogMaxFiles)
} else {
fileDst, err = open(logFile)
}
if err != nil {
l.Warnln("Failed to setup logging to file, proceeding with logging to stdout only:", err)
} else {
if runtime.GOOS == "windows" {
// Translate line breaks to Windows standard
fileDst = osutil.ReplacingWriter{
Writer: fileDst,
From: '\n',
To: []byte{'\r', '\n'},
}
}
// Log to both stdout and file.
dst = io.MultiWriter(dst, fileDst)
l.Infof(`Log output saved to file "%s"`, logFile)
}
}
args := os.Args
var restarts [restartCounts]time.Time
stopSign := make(chan os.Signal, 1)
signal.Notify(stopSign, os.Interrupt, sigTerm)
restartSign := make(chan os.Signal, 1)
sigHup := syscall.Signal(1)
signal.Notify(restartSign, sigHup)
childEnv := childEnv()
first := true
for {
maybeReportPanics()
if t := time.Since(restarts[0]); t < restartLoopThreshold {
l.Warnf("%d restarts in %v; not retrying further", restartCounts, t)
os.Exit(svcutil.ExitError.AsInt())
}
copy(restarts[0:], restarts[1:])
restarts[len(restarts)-1] = time.Now()
cmd := exec.Command(args[0], args[1:]...)
cmd.Env = childEnv
stderr, err := cmd.StderrPipe()
if err != nil {
panic(err)
}
stdout, err := cmd.StdoutPipe()
if err != nil {
panic(err)
}
l.Debugln("Starting syncthing")
err = cmd.Start()
if err != nil {
l.Warnln("Error starting the main Syncthing process:", err)
panic("Error starting the main Syncthing process")
}
stdoutMut.Lock()
stdoutFirstLines = make([]string, 0, 10)
stdoutLastLines = make([]string, 0, 50)
stdoutMut.Unlock()
wg := sync.NewWaitGroup()
wg.Add(1)
go func() {
copyStderr(stderr, dst)
wg.Done()
}()
wg.Add(1)
go func() {
copyStdout(stdout, dst)
wg.Done()
}()
exit := make(chan error)
go func() {
wg.Wait()
exit <- cmd.Wait()
}()
stopped := false
select {
case s := <-stopSign:
l.Infof("Signal %d received; exiting", s)
cmd.Process.Signal(sigTerm)
err = <-exit
stopped = true
case s := <-restartSign:
l.Infof("Signal %d received; restarting", s)
cmd.Process.Signal(sigHup)
err = <-exit
case err = <-exit:
}
if err == nil {
// Successful exit indicates an intentional shutdown
os.Exit(svcutil.ExitSuccess.AsInt())
}
if exiterr, ok := err.(*exec.ExitError); ok {
exitCode := exiterr.ExitCode()
if stopped || options.NoRestart {
os.Exit(exitCode)
}
if exitCode == svcutil.ExitUpgrade.AsInt() {
// Restart the monitor process to release the .old
// binary as part of the upgrade process.
l.Infoln("Restarting monitor...")
if err = restartMonitor(args); err != nil {
l.Warnln("Restart:", err)
}
os.Exit(exitCode)
}
}
if options.NoRestart {
os.Exit(svcutil.ExitError.AsInt())
}
l.Infoln("Syncthing exited:", err)
time.Sleep(restartPause)
if first {
// Let the next child process know that this is not the first time
// it's starting up.
childEnv = append(childEnv, "STRESTART=yes")
first = false
}
}
}
func copyStderr(stderr io.Reader, dst io.Writer) {
br := bufio.NewReader(stderr)
var panicFd *os.File
defer func() {
if panicFd != nil {
_ = panicFd.Close()
maybeReportPanics()
}
}()
for {
line, err := br.ReadString('\n')
if err != nil {
return
}
if panicFd == nil {
dst.Write([]byte(line))
if strings.Contains(line, "SIGILL") {
l.Warnln(`
*******************************************************************************
* Crash due to illegal instruction detected. This is most likely due to a CPU *
* incompatibility with the high performance hashing package. Switching to the *
* standard hashing package instead. Please report this issue at: *
* *
* https://github.com/syncthing/syncthing/issues *
* *
* Include the details of your CPU. *
*******************************************************************************
`)
os.Setenv("STHASHING", "standard")
return
}
if strings.HasPrefix(line, "panic:") || strings.HasPrefix(line, "fatal error:") {
panicFd, err = os.Create(locations.GetTimestamped(locations.PanicLog))
if err != nil {
l.Warnln("Create panic log:", err)
continue
}
l.Warnf("Panic detected, writing to \"%s\"", panicFd.Name())
if strings.Contains(line, "leveldb") && strings.Contains(line, "corrupt") {
l.Warnln(`
*********************************************************************************
* Crash due to corrupt database. *
* *
* This crash usually occurs due to one of the following reasons: *
* - Syncthing being stopped abruptly (killed/loss of power) *
* - Bad hardware (memory/disk issues) *
* - Software that affects disk writes (SSD caching software and simillar) *
* *
* Please see the following URL for instructions on how to recover: *
* https://docs.syncthing.net/users/faq.html#my-syncthing-database-is-corrupt *
*********************************************************************************
`)
} else {
l.Warnln("Please check for existing issues with similar panic message at https://github.com/syncthing/syncthing/issues/")
l.Warnln("If no issue with similar panic message exists, please create a new issue with the panic log attached")
}
stdoutMut.Lock()
for _, line := range stdoutFirstLines {
panicFd.WriteString(line)
}
panicFd.WriteString("...\n")
for _, line := range stdoutLastLines {
panicFd.WriteString(line)
}
stdoutMut.Unlock()
}
panicFd.WriteString("Panic at " + time.Now().Format(time.RFC3339) + "\n")
}
if panicFd != nil {
panicFd.WriteString(line)
}
}
}
func copyStdout(stdout io.Reader, dst io.Writer) {
br := bufio.NewReader(stdout)
for {
line, err := br.ReadString('\n')
if err != nil {
return
}
stdoutMut.Lock()
if len(stdoutFirstLines) < cap(stdoutFirstLines) {
stdoutFirstLines = append(stdoutFirstLines, line)
} else {
if l := len(stdoutLastLines); l == cap(stdoutLastLines) {
stdoutLastLines = stdoutLastLines[:l-1]
}
stdoutLastLines = append(stdoutLastLines, line)
}
stdoutMut.Unlock()
dst.Write([]byte(line))
}
}
func restartMonitor(args []string) error {
// Set the STRESTART environment variable to indicate to the next
// process that this is a restart and not initial start. This prevents
// opening the browser on startup.
os.Setenv("STRESTART", "yes")
if runtime.GOOS != "windows" {
// syscall.Exec is the cleanest way to restart on Unixes as it
// replaces the current process with the new one, keeping the pid and
// controlling terminal and so on
return restartMonitorUnix(args)
}
// but it isn't supported on Windows, so there we start a normal
// exec.Command and return.
return restartMonitorWindows(args)
}
func restartMonitorUnix(args []string) error {
if !strings.ContainsRune(args[0], os.PathSeparator) {
// The path to the binary doesn't contain a slash, so it should be
// found in $PATH.
binary, err := exec.LookPath(args[0])
if err != nil {
return err
}
args[0] = binary
}
return syscall.Exec(args[0], args, os.Environ())
}
func restartMonitorWindows(args []string) error {
cmd := exec.Command(args[0], args[1:]...)
// Retain the standard streams
cmd.Stderr = os.Stderr
cmd.Stdout = os.Stdout
cmd.Stdin = os.Stdin
return cmd.Start()
}
// rotatedFile keeps a set of rotating logs. There will be the base file plus up
// to maxFiles rotated ones, each ~ maxSize bytes large.
type rotatedFile struct {
name string
create createFn
maxSize int64 // bytes
maxFiles int
currentFile io.WriteCloser
currentSize int64
}
type createFn func(name string) (io.WriteCloser, error)
func newRotatedFile(name string, create createFn, maxSize int64, maxFiles int) (*rotatedFile, error) {
var size int64
if info, err := os.Lstat(name); err != nil {
if !os.IsNotExist(err) {
return nil, err
}
size = 0
} else {
size = info.Size()
}
writer, err := create(name)
if err != nil {
return nil, err
}
return &rotatedFile{
name: name,
create: create,
maxSize: maxSize,
maxFiles: maxFiles,
currentFile: writer,
currentSize: size,
}, nil
}
func (r *rotatedFile) Write(bs []byte) (int, error) {
// Check if we're about to exceed the max size, and if so close this
// file so we'll start on a new one.
if r.currentSize+int64(len(bs)) > r.maxSize {
r.currentFile.Close()
r.currentSize = 0
r.rotate()
f, err := r.create(r.name)
if err != nil {
return 0, err
}
r.currentFile = f
}
n, err := r.currentFile.Write(bs)
r.currentSize += int64(n)
return n, err
}
func (r *rotatedFile) rotate() {
// The files are named "name", "name.0", "name.1", ...
// "name.(r.maxFiles-1)". Increase the numbers on the
// suffixed ones.
for i := r.maxFiles - 1; i > 0; i-- {
from := numberedFile(r.name, i-1)
to := numberedFile(r.name, i)
err := os.Rename(from, to)
if err != nil && !os.IsNotExist(err) {
fmt.Println("LOG: Rotating logs:", err)
}
}
// Rename the base to base.0
err := os.Rename(r.name, numberedFile(r.name, 0))
if err != nil && !os.IsNotExist(err) {
fmt.Println("LOG: Rotating logs:", err)
}
}
// numberedFile adds the number between the file name and the extension.
func numberedFile(name string, num int) string {
ext := filepath.Ext(name) // contains the dot
withoutExt := name[:len(name)-len(ext)]
return fmt.Sprintf("%s.%d%s", withoutExt, num, ext)
}
// An autoclosedFile is an io.WriteCloser that opens itself for appending on
// Write() and closes itself after an interval of no writes (closeDelay) or
// when the file has been open for too long (maxOpenTime). A call to Write()
// will return any error that happens on the resulting Open() call too. Errors
// on automatic Close() calls are silently swallowed...
type autoclosedFile struct {
name string // path to write to
closeDelay time.Duration // close after this long inactivity
maxOpenTime time.Duration // or this long after opening
fd io.WriteCloser // underlying WriteCloser
opened time.Time // timestamp when the file was last opened
closed chan struct{} // closed on Close(), stops the closerLoop
closeTimer *time.Timer // fires closeDelay after a write
mut sync.Mutex
}
func newAutoclosedFile(name string, closeDelay, maxOpenTime time.Duration) (*autoclosedFile, error) {
f := &autoclosedFile{
name: name,
closeDelay: closeDelay,
maxOpenTime: maxOpenTime,
mut: sync.NewMutex(),
closed: make(chan struct{}),
closeTimer: time.NewTimer(time.Minute),
}
f.mut.Lock()
defer f.mut.Unlock()
if err := f.ensureOpenLocked(); err != nil {
return nil, err
}
go f.closerLoop()
return f, nil
}
func (f *autoclosedFile) Write(bs []byte) (int, error) {
f.mut.Lock()
defer f.mut.Unlock()
// Make sure the file is open for appending
if err := f.ensureOpenLocked(); err != nil {
return 0, err
}
// If we haven't run into the maxOpenTime, postpone close for another
// closeDelay
if time.Since(f.opened) < f.maxOpenTime {
f.closeTimer.Reset(f.closeDelay)
}
return f.fd.Write(bs)
}
func (f *autoclosedFile) Close() error {
f.mut.Lock()
defer f.mut.Unlock()
// Stop the timer and closerLoop() routine
f.closeTimer.Stop()
close(f.closed)
// Close the file, if it's open
if f.fd != nil {
return f.fd.Close()
}
return nil
}
// Must be called with f.mut held!
func (f *autoclosedFile) ensureOpenLocked() error {
if f.fd != nil {
// File is already open
return nil
}
// We open the file for write only, and create it if it doesn't exist.
flags := os.O_WRONLY | os.O_CREATE | os.O_APPEND
fd, err := os.OpenFile(f.name, flags, 0644)
if err != nil {
return err
}
f.fd = fd
f.opened = time.Now()
return nil
}
func (f *autoclosedFile) closerLoop() {
for {
select {
case <-f.closeTimer.C:
// Close the file when the timer expires.
f.mut.Lock()
if f.fd != nil {
f.fd.Close() // errors, schmerrors
f.fd = nil
}
f.mut.Unlock()
case <-f.closed:
return
}
}
}
// Returns the desired child environment, properly filtered and added to.
func childEnv() []string {
var env []string
for _, str := range os.Environ() {
if strings.HasPrefix(str, "STNORESTART=") {
continue
}
if strings.HasPrefix(str, "STMONITORED=") {
continue
}
env = append(env, str)
}
env = append(env, "STMONITORED=yes")
return env
}
// maybeReportPanics tries to figure out if crash reporting is on or off,
// and reports any panics it can find if it's enabled. We spend at most
// panicUploadMaxWait uploading panics...
func maybeReportPanics() {
// Try to get a config to see if/where panics should be reported.
cfg, err := loadOrDefaultConfig(protocol.EmptyDeviceID, events.NoopLogger)
if err != nil {
l.Warnln("Couldn't load config; not reporting crash")
return
}
// Bail if we're not supposed to report panics.
opts := cfg.Options()
if !opts.CREnabled {
return
}
// Set up a timeout on the whole operation.
ctx, cancel := context.WithTimeout(context.Background(), panicUploadMaxWait)
defer cancel()
// Print a notice if the upload takes a long time.
go func() {
select {
case <-ctx.Done():
return
case <-time.After(panicUploadNoticeWait):
l.Warnln("Uploading crash reports is taking a while, please wait...")
}
}()
// Report the panics.
dir := locations.GetBaseDir(locations.ConfigBaseDir)
uploadPanicLogs(ctx, opts.CRURL, dir)
}