//+build linux

package netlink

import (
	"math"
	"os"
	"runtime"
	"sync"
	"syscall"
	"time"
	"unsafe"

	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"
)

// Compile-time check that *conn implements Socket.
var _ Socket = &conn{}

// Compile-time check that *conn implements deadlineSetter.
var _ deadlineSetter = &conn{}

// A conn is the Linux implementation of a netlink sockets connection.
//
// All conn methods must wrap system call errors with os.NewSyscallError to
// enable more intelligible error messages in OpError.
type conn struct {
	// s performs the socket operations; bind accepts either the system call
	// implementation or a mocked one for tests.
	s  socket
	// sa is the netlink address that bind passed to the socket's Bind method.
	sa *unix.SockaddrNetlink
}

// A socket is an interface over socket system calls.
//
// It is the seam between conn and the operating system: sysSocket in this
// file implements it, and bind accepts any implementation.
type socket interface {
	// Address and lifecycle management.
	Bind(sa unix.Sockaddr) error
	Close() error
	FD() int
	File() *os.File
	Getsockname() (unix.Sockaddr, error)
	// Message I/O.
	Recvmsg(p, oob []byte, flags int) (n int, oobn int, recvflags int, from unix.Sockaddr, err error)
	Sendmsg(p, oob []byte, to unix.Sockaddr, flags int) error
	// Deadlines.
	SetDeadline(t time.Time) error
	SetReadDeadline(t time.Time) error
	SetWriteDeadline(t time.Time) error
	// Socket options.
	SetSockoptSockFprog(level, opt int, fprog *unix.SockFprog) error
	SetSockoptInt(level, opt, value int) error
}

// dial is the entry point for Dial.  dial opens a netlink socket using
// system calls, and returns its PID.
//
// family is passed to the socket system call as the netlink protocol. A nil
// config is treated the same as an empty Config.
//
// Errors from creating the socket are wrapped with os.NewSyscallError; errors
// from newSysSocket are returned as-is.
func dial(family int, config *Config) (*conn, uint32, error) {
	if config == nil {
		config = &Config{}
	}

	// newSysSocket starts the worker goroutine which performs the socket
	// system calls on behalf of the returned sysSocket.
	sock, err := newSysSocket(config)
	if err != nil {
		return nil, 0, err
	}

	if err := sock.Socket(family); err != nil {
		// The worker goroutine started by newSysSocket is still running, so
		// it must be stopped to avoid leaking it. Close does that; no file
		// has been associated with the socket yet, so the error Close
		// reports carries no information and is deliberately dropped in
		// favor of the socket error.
		_ = sock.Close()
		return nil, 0, os.NewSyscallError("socket", err)
	}

	// bind takes ownership of sock and closes it if binding fails.
	return bind(sock, config)
}

// bind completes the setup of a netlink connection on top of s, which is
// either the system call backed socket or a mock supplied by tests.
//
// s is bound to the multicast groups from config (a nil config means an empty
// Config), and the PID reported by Getsockname is returned alongside the new
// conn. If any system call fails, s is closed before the wrapped error is
// returned.
func bind(s socket, config *Config) (*conn, uint32, error) {
	if config == nil {
		config = &Config{}
	}

	// fail closes s before reporting a failed system call, so that no file
	// descriptor is leaked on the error paths below. The error from Close is
	// intentionally dropped in favor of the original failure.
	fail := func(call string, err error) (*conn, uint32, error) {
		_ = s.Close()
		return nil, 0, os.NewSyscallError(call, err)
	}

	local := &unix.SockaddrNetlink{
		Family: unix.AF_NETLINK,
		Groups: config.Groups,
	}
	if err := s.Bind(local); err != nil {
		return fail("bind", err)
	}

	bound, err := s.Getsockname()
	if err != nil {
		return fail("getsockname", err)
	}

	// The PID comes from the address reported by the socket, while the conn
	// keeps the address that was handed to Bind.
	pid := bound.(*unix.SockaddrNetlink).Pid
	c := &conn{
		s:  s,
		sa: local,
	}

	return c, pid, nil
}

// SendMessages marshals each of messages in order, concatenates the results
// into a single buffer, and hands that buffer to one sendmsg call.
//
// A marshaling error is returned as-is and nothing is sent; a sendmsg error
// is wrapped with os.NewSyscallError.
func (c *conn) SendMessages(messages []Message) error {
	var payload []byte
	for _, msg := range messages {
		encoded, err := msg.MarshalBinary()
		if err != nil {
			return err
		}

		payload = append(payload, encoded...)
	}

	// Only the address family is set on the destination address.
	dst := &unix.SockaddrNetlink{Family: unix.AF_NETLINK}

	err := c.s.Sendmsg(payload, nil, dst, 0)
	return os.NewSyscallError("sendmsg", err)
}

// Send marshals m and transmits it to netlink with a single sendmsg call.
//
// A marshaling error is returned as-is; a sendmsg error is wrapped with
// os.NewSyscallError.
func (c *conn) Send(m Message) error {
	payload, err := m.MarshalBinary()
	if err != nil {
		return err
	}

	// Only the address family is set on the destination address.
	dst := &unix.SockaddrNetlink{Family: unix.AF_NETLINK}

	err = c.s.Sendmsg(payload, nil, dst, 0)
	return os.NewSyscallError("sendmsg", err)
}

// Receive reads the next datagram from netlink and parses it into one or
// more Messages.
//
// recvmsg errors are wrapped with os.NewSyscallError; parse errors are
// returned as-is.
func (c *conn) Receive() ([]Message, error) {
	// Start with a single page and keep doubling the buffer for as long as a
	// peek at the socket fills it completely, since a full buffer may mean
	// that more data is waiting than fits.
	//
	// TODO(mdlayher): deal with OOB message data if available, such as
	// when PacketInfo ConnOption is true.
	buf := make([]byte, os.Getpagesize())
	for {
		peeked, _, _, _, err := c.s.Recvmsg(buf, nil, unix.MSG_PEEK)
		if err != nil {
			return nil, os.NewSyscallError("recvmsg", err)
		}

		if peeked >= len(buf) {
			buf = make([]byte, 2*len(buf))
			continue
		}

		break
	}

	// The buffer is now large enough; consume the data for real.
	size, _, _, _, err := c.s.Recvmsg(buf, nil, 0)
	if err != nil {
		return nil, os.NewSyscallError("recvmsg", err)
	}

	// The parser is handed the received length rounded by nlmsgAlign.
	parsed, err := syscall.ParseNetlinkMessage(buf[:nlmsgAlign(size)])
	if err != nil {
		return nil, err
	}

	out := make([]Message, len(parsed))
	for i, p := range parsed {
		out[i] = Message{
			Header: sysToHeader(p.Header),
			Data:   p.Data,
		}
	}

	return out, nil
}

// Close closes the underlying socket, wrapping any error with
// os.NewSyscallError.
func (c *conn) Close() error {
	err := c.s.Close()
	return os.NewSyscallError("close", err)
}

// FD reports the file descriptor of the Conn, as returned by the underlying
// socket.
func (c *conn) FD() int {
	fd := c.s.FD()
	return fd
}

// File reports the *os.File associated with the Conn, as returned by the
// underlying socket.
func (c *conn) File() *os.File {
	f := c.s.File()
	return f
}

// JoinGroup subscribes the Conn to the multicast group with the given ID by
// setting NETLINK_ADD_MEMBERSHIP. Errors are wrapped with
// os.NewSyscallError.
func (c *conn) JoinGroup(group uint32) error {
	err := c.s.SetSockoptInt(unix.SOL_NETLINK, unix.NETLINK_ADD_MEMBERSHIP, int(group))
	return os.NewSyscallError("setsockopt", err)
}

// LeaveGroup unsubscribes the Conn from the multicast group with the given
// ID by setting NETLINK_DROP_MEMBERSHIP. Errors are wrapped with
// os.NewSyscallError.
func (c *conn) LeaveGroup(group uint32) error {
	err := c.s.SetSockoptInt(unix.SOL_NETLINK, unix.NETLINK_DROP_MEMBERSHIP, int(group))
	return os.NewSyscallError("setsockopt", err)
}

// SetBPF attaches an assembled BPF program to a conn.
//
// filter must contain at least one instruction and no more instructions than
// the 16-bit length field of unix.SockFprog can describe; otherwise EINVAL is
// returned without issuing a system call. All errors are wrapped with
// os.NewSyscallError.
func (c *conn) SetBPF(filter []bpf.RawInstruction) error {
	// The program is passed by pointing at the first instruction, which does
	// not exist for an empty filter, and its length is converted to uint16,
	// which would silently truncate an oversized filter.
	if len(filter) == 0 || len(filter) > math.MaxUint16 {
		return os.NewSyscallError("setsockopt", unix.EINVAL)
	}

	prog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: (*unix.SockFilter)(unsafe.Pointer(&filter[0])),
	}

	return os.NewSyscallError("setsockopt", c.s.SetSockoptSockFprog(
		unix.SOL_SOCKET,
		unix.SO_ATTACH_FILTER,
		&prog,
	))
}

// RemoveBPF detaches the BPF filter from a conn by setting SO_DETACH_FILTER.
// Errors are wrapped with os.NewSyscallError.
func (c *conn) RemoveBPF() error {
	// SO_DETACH_FILTER ignores its argument, so 0 is passed.
	err := c.s.SetSockoptInt(unix.SOL_SOCKET, unix.SO_DETACH_FILTER, 0)
	return os.NewSyscallError("setsockopt", err)
}

// SetOption turns a netlink socket option on or off for the Conn.
//
// option is translated with linuxOption; an option without a Linux
// equivalent yields ENOPROTOOPT, the typical Linux error for an unknown
// socket option. All errors are wrapped with os.NewSyscallError.
func (c *conn) SetOption(option ConnOption, enable bool) error {
	opt, known := linuxOption(option)
	if !known {
		return os.NewSyscallError("setsockopt", unix.ENOPROTOOPT)
	}

	// The option value is 1 to enable and 0 to disable.
	value := 0
	if enable {
		value = 1
	}

	err := c.s.SetSockoptInt(unix.SOL_NETLINK, opt, value)
	return os.NewSyscallError("setsockopt", err)
}

// SetDeadline forwards t to the underlying socket's SetDeadline and returns
// its error unchanged.
func (c *conn) SetDeadline(t time.Time) error {
	err := c.s.SetDeadline(t)
	return err
}

// SetReadDeadline forwards t to the underlying socket's SetReadDeadline and
// returns its error unchanged.
func (c *conn) SetReadDeadline(t time.Time) error {
	err := c.s.SetReadDeadline(t)
	return err
}

// SetWriteDeadline forwards t to the underlying socket's SetWriteDeadline and
// returns its error unchanged.
func (c *conn) SetWriteDeadline(t time.Time) error {
	err := c.s.SetWriteDeadline(t)
	return err
}

// SetReadBuffer sets the size of the operating system's receive buffer
// associated with the Conn by setting SO_RCVBUF to bytes. Errors are wrapped
// with os.NewSyscallError.
func (c *conn) SetReadBuffer(bytes int) error {
	err := c.s.SetSockoptInt(unix.SOL_SOCKET, unix.SO_RCVBUF, bytes)
	return os.NewSyscallError("setsockopt", err)
}

// SetWriteBuffer sets the size of the operating system's transmit buffer
// associated with the Conn by setting SO_SNDBUF to bytes. Errors are wrapped
// with os.NewSyscallError.
func (c *conn) SetWriteBuffer(bytes int) error {
	err := c.s.SetSockoptInt(unix.SOL_SOCKET, unix.SO_SNDBUF, bytes)
	return os.NewSyscallError("setsockopt", err)
}

// linuxOption maps a ConnOption to the matching Linux NETLINK_* socket option
// value. The boolean result is false, and the value 0, when o has no Linux
// equivalent.
func linuxOption(o ConnOption) (int, bool) {
	var v int

	switch o {
	case PacketInfo:
		v = unix.NETLINK_PKTINFO
	case BroadcastError:
		v = unix.NETLINK_BROADCAST_ERROR
	case NoENOBUFS:
		v = unix.NETLINK_NO_ENOBUFS
	case ListenAllNSID:
		v = unix.NETLINK_LISTEN_ALL_NSID
	case CapAcknowledge:
		v = unix.NETLINK_CAP_ACK
	case ExtendedAcknowledge:
		v = unix.NETLINK_EXT_ACK
	default:
		return 0, false
	}

	return v, true
}

// sysToHeader reinterprets a syscall.NlMsghdr as a Header and returns a copy
// of it.
func sysToHeader(r syscall.NlMsghdr) Header {
	// NB: this is an unsafe cast rather than a field-by-field copy, so the
	// memory layout of Header and syscall.NlMsghdr must be exactly the same.
	h := (*Header)(unsafe.Pointer(&r))
	return *h
}

// newError turns an error number received from netlink into the matching
// Linux system call error, a syscall.Errno.
func newError(errno int) error {
	var sysErr syscall.Errno = syscall.Errno(errno)
	return sysErr
}

// Compile-time check that *sysSocket implements socket.
var _ socket = &sysSocket{}

// A sysSocket is a socket which uses system calls for socket operations.
//
// Operations that touch the file descriptor are executed on the worker
// goroutine g, and are refused with EBADF once the socket has been closed.
type sysSocket struct {
	// mu guards closed: operations hold the read lock while they run, and
	// Close holds the write lock.
	mu     sync.RWMutex
	// fd is the file wrapping the socket's descriptor; it is set by Socket.
	fd     *os.File
	// closed is set by Close.
	closed bool
	// g is the worker goroutine on which the socket operations are run.
	g      *lockedNetNSGoroutine
}

// newSysSocket creates a sysSocket together with the worker goroutine that
// will carry out its system calls. The worker is created for the network
// namespace config.NetNS, and uses threadNetNS to determine network
// namespaces.
//
// No file descriptor is opened here; that is left to Socket.
func newSysSocket(config *Config) (*sysSocket, error) {
	worker, err := newLockedNetNSGoroutine(config.NetNS, threadNetNS)
	if err != nil {
		return nil, err
	}

	s := &sysSocket{g: worker}
	return s, nil
}

// do runs f on the socket's worker goroutine, which can be locked to one
// thread, and waits for it to finish. It returns EBADF without running f if
// the socket has been closed.
func (s *sysSocket) do(f func()) error {
	// Only a read lock is needed: this function reads s.closed but never
	// writes it.
	s.mu.RLock()
	defer s.mu.RUnlock()

	if !s.closed {
		s.g.run(f)
		return nil
	}

	return syscall.EBADF
}

// read runs the read function f against the socket's file by passing both to
// fdread on the worker goroutine, and returns fdread's error. It returns
// EBADF without running f if the socket has been closed.
func (s *sysSocket) read(f func(fd int) bool) error {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if s.closed {
		return syscall.EBADF
	}

	var ioErr error
	task := func() {
		ioErr = fdread(s.fd, f)
	}
	s.g.run(task)

	return ioErr
}

// write runs the write function f against the socket's file by passing both
// to fdwrite on the worker goroutine, and returns fdwrite's error. It returns
// EBADF without running f if the socket has been closed.
func (s *sysSocket) write(f func(fd int) bool) error {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if s.closed {
		return syscall.EBADF
	}

	var ioErr error
	task := func() {
		ioErr = fdwrite(s.fd, f)
	}
	s.g.run(task)

	return ioErr
}

// control runs the control function f against the socket's file by passing
// both to fdcontrol on the worker goroutine, and returns fdcontrol's error.
// It returns EBADF without running f if the socket has been closed.
func (s *sysSocket) control(f func(fd int)) error {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if s.closed {
		return syscall.EBADF
	}

	var ctlErr error
	task := func() {
		ctlErr = fdcontrol(s.fd, f)
	}
	s.g.run(task)

	return ctlErr
}

// Socket creates the netlink socket for the given family (netlink protocol)
// on the worker goroutine and stores the resulting file in s.
//
// Errors are returned unwrapped. If the socket is created but cannot be
// switched to the required blocking mode, the new file descriptor is closed
// again before the error is returned.
func (s *sysSocket) Socket(family int) error {
	var (
		fd  int
		err error
	)

	doErr := s.do(func() {
		// Mirror what the standard library does when creating file
		// descriptors: avoid racing a fork/exec with the creation
		// of new file descriptors, so that child processes do not
		// inherit netlink socket file descriptors unexpectedly.
		//
		// On Linux, SOCK_CLOEXEC was introduced in 2.6.27. OTOH,
		// Go supports Linux 2.6.23 and above. If we get EINVAL on
		// the first try, it may be that we are running on a kernel
		// older than 2.6.27. In that case, take syscall.ForkLock
		// and try again without SOCK_CLOEXEC.
		//
		// SOCK_NONBLOCK was also added in 2.6.27, but we don't
		// use SOCK_NONBLOCK here for now, not until we remove support
		// for Go 1.11, since we still support the old blocking file
		// descriptor behavior.
		//
		// For a more thorough explanation, see similar work in the
		// Go tree: func sysSocket in net/sock_cloexec.go, as well
		// as the detailed comment in syscall/exec_unix.go.
		//
		// TODO(acln): update this to mirror net.sysSocket completely:
		// use SOCK_NONBLOCK as well, and remove the separate
		// setBlockingMode step once Go 1.11 support is removed and
		// we switch to using entirely non-blocking file descriptors.
		fd, err = unix.Socket(
			unix.AF_NETLINK,
			unix.SOCK_RAW|unix.SOCK_CLOEXEC,
			family,
		)
		if err == unix.EINVAL {
			syscall.ForkLock.RLock()
			fd, err = unix.Socket(
				unix.AF_NETLINK,
				unix.SOCK_RAW,
				family,
			)
			if err == nil {
				unix.CloseOnExec(fd)
			}
			syscall.ForkLock.RUnlock()
		}
	})
	if doErr != nil {
		return doErr
	}
	if err != nil {
		return err
	}

	if err := setBlockingMode(fd); err != nil {
		// The descriptor has not been stored anywhere yet, so nothing else
		// would ever close it. The error from close is dropped in favor of
		// the more informative original failure.
		_ = unix.Close(fd)
		return err
	}

	// When using Go 1.12+, the setBlockingMode call we just did puts the
	// file descriptor into non-blocking mode. In that case, os.NewFile
	// registers the file descriptor with the runtime poller, which is
	// then used for all subsequent operations.
	//
	// See also: https://golang.org/pkg/os/#NewFile
	s.fd = os.NewFile(uintptr(fd), "netlink")
	return nil
}

// Bind binds the socket to sa by calling unix.Bind through control. An error
// from control itself takes precedence over the error from the system call.
func (s *sysSocket) Bind(sa unix.Sockaddr) error {
	var callErr error

	if err := s.control(func(fd int) {
		callErr = unix.Bind(fd, sa)
	}); err != nil {
		return err
	}

	return callErr
}

// Close closes the socket's file and stops the worker goroutine, returning
// the error from closing the file.
//
// Calling Close on a socket that is already closed returns EBADF, matching
// what every other operation reports after Close.
func (s *sysSocket) Close() error {
	// Be sure to acquire a write lock because we need to stop any other
	// goroutines from sending system call requests after close.
	// Any invocation of do() after this write lock unlocks is guaranteed
	// to find s.closed being true.
	s.mu.Lock()
	defer s.mu.Unlock()

	// The worker goroutine must only be stopped once: stopping it closes a
	// channel, and closing a channel a second time panics.
	if s.closed {
		return syscall.EBADF
	}

	// Close the socket from the main thread, this operation has no risk
	// of routing data to the wrong socket.
	err := s.fd.Close()
	s.closed = true

	// Stop the associated goroutine and wait for it to return.
	s.g.stop()

	return err
}

// FD returns the descriptor reported by the socket's file, converted to int.
func (s *sysSocket) FD() int {
	raw := s.fd.Fd()
	return int(raw)
}

// File returns the socket's file.
func (s *sysSocket) File() *os.File {
	return s.fd
}

// Getsockname returns the socket's address by calling unix.Getsockname
// through control. If control itself fails, a nil address and that error are
// returned.
func (s *sysSocket) Getsockname() (unix.Sockaddr, error) {
	var (
		addr    unix.Sockaddr
		callErr error
	)

	if err := s.control(func(fd int) {
		addr, callErr = unix.Getsockname(fd)
	}); err != nil {
		return nil, err
	}

	return addr, callErr
}

// Recvmsg receives a message into p and oob by calling unix.Recvmsg through
// read, and returns the results of that call. If read itself fails, zero
// values and that error are returned.
func (s *sysSocket) Recvmsg(p, oob []byte, flags int) (int, int, int, unix.Sockaddr, error) {
	var (
		n, oobn, recvflags int
		from               unix.Sockaddr
		callErr            error
	)

	attempt := func(fd int) bool {
		n, oobn, recvflags, from, callErr = unix.Recvmsg(fd, p, oob, flags)

		// When the socket is in non-blocking mode, we might see
		// EAGAIN and end up here. In that case, return false to
		// let the poller wait for readiness. See the source code
		// for internal/poll.FD.RawRead for more details.
		//
		// If the socket is in blocking mode, EAGAIN should never occur.
		return callErr != syscall.EAGAIN
	}

	if err := s.read(attempt); err != nil {
		return 0, 0, 0, nil, err
	}

	return n, oobn, recvflags, from, callErr
}

// Sendmsg sends p and oob to the address to by calling unix.Sendmsg through
// write. An error from write itself takes precedence over the error from the
// system call.
func (s *sysSocket) Sendmsg(p, oob []byte, to unix.Sockaddr, flags int) error {
	var callErr error

	attempt := func(fd int) bool {
		callErr = unix.Sendmsg(fd, p, oob, to, flags)

		// As in Recvmsg: report false on EAGAIN so that the send is not
		// treated as finished.
		return callErr != syscall.EAGAIN
	}

	if err := s.write(attempt); err != nil {
		return err
	}

	return callErr
}

// SetDeadline forwards t to the SetDeadline method of the socket's file.
func (s *sysSocket) SetDeadline(t time.Time) error {
	err := s.fd.SetDeadline(t)
	return err
}

// SetReadDeadline forwards t to the SetReadDeadline method of the socket's
// file.
func (s *sysSocket) SetReadDeadline(t time.Time) error {
	err := s.fd.SetReadDeadline(t)
	return err
}

// SetWriteDeadline forwards t to the SetWriteDeadline method of the socket's
// file.
func (s *sysSocket) SetWriteDeadline(t time.Time) error {
	err := s.fd.SetWriteDeadline(t)
	return err
}

// SetSockoptInt sets the integer socket option opt at the given level to
// value by calling unix.SetsockoptInt through control.
//
// EINVAL is returned, without issuing a system call, when value does not fit
// in a 32-bit signed integer.
func (s *sysSocket) SetSockoptInt(level, opt, value int) error {
	// Value must be in range of a C integer.
	if value > math.MaxInt32 || value < math.MinInt32 {
		return unix.EINVAL
	}

	var callErr error

	if err := s.control(func(fd int) {
		callErr = unix.SetsockoptInt(fd, level, opt, value)
	}); err != nil {
		return err
	}

	return callErr
}

// SetSockoptSockFprog sets the socket option opt at the given level to the
// program fprog by calling unix.SetsockoptSockFprog through control. An error
// from control itself takes precedence over the error from the system call.
func (s *sysSocket) SetSockoptSockFprog(level, opt int, fprog *unix.SockFprog) error {
	var callErr error

	if err := s.control(func(fd int) {
		callErr = unix.SetsockoptSockFprog(fd, level, opt, fprog)
	}); err != nil {
		return err
	}

	return callErr
}

// lockedNetNSGoroutine is a worker goroutine locked to an operating system
// thread, optionally configured to run in a non-default network namespace.
//
// Functions are handed to the worker with run, and the worker is shut down
// with stop.
type lockedNetNSGoroutine struct {
	// wg tracks the worker goroutine so that stop can wait for it to exit.
	wg    sync.WaitGroup
	// doneC is closed by stop to tell the worker to return.
	doneC chan struct{}
	// funcC carries the functions that run asks the worker to execute.
	funcC chan func()
}

// newLockedNetNSGoroutine creates a lockedNetNSGoroutine that will enter the
// specified network namespace netNS (by file descriptor), and will use the
// getNS function to produce netNS handles.
//
// A netNS of 0 means that no namespace was explicitly requested: the worker
// then tries to enter the namespace of the calling thread, and a failure to
// do so is ignored. For any other value, a failure to enter the namespace is
// returned as an error.
//
// The function does not return until the worker has either finished its
// initialization or reported an error. On success the worker keeps running
// until stop is called.
func newLockedNetNSGoroutine(netNS int, getNS func() (*netNS, error)) (*lockedNetNSGoroutine, error) {
	// Any bare syscall errors (e.g. setns) should be wrapped with
	// os.NewSyscallError for the remainder of this function.

	// Capture the namespace of the calling thread before the worker starts;
	// it is the fallback when netNS is 0. The handle is only needed during
	// initialization, so it is closed when this function returns.
	callerNS, err := getNS()
	if err != nil {
		return nil, err
	}
	defer callerNS.Close()

	g := &lockedNetNSGoroutine{
		doneC: make(chan struct{}),
		funcC: make(chan func()),
	}

	// errC is unbuffered: the worker sends exactly one value on it, either an
	// initialization error or nil, and this function blocks until it arrives.
	errC := make(chan error)
	// Register the worker before it starts so that stop can wait for it.
	g.wg.Add(1)

	go func() {
		// It is important to lock this goroutine to its OS thread for the duration
		// of the netlink socket being used, or else the kernel may end up routing
		// messages to the wrong places.
		// See: http://lists.infradead.org/pipermail/libnl/2017-February/002293.html.
		//
		//
		// In addition, the OS thread must also remain locked because we attempt
		// to manipulate the network namespace of the thread within this goroutine.
		//
		// The intent is to never unlock the OS thread, so that the thread
		// will terminate when the goroutine exits starting in Go 1.10:
		// https://go-review.googlesource.com/c/go/+/46038.
		//
		// However, due to recent instability and a potential bad interaction
		// with the Go runtime for threads which are not unlocked, we have
		// elected to temporarily unlock the thread when the goroutine terminates:
		// https://github.com/golang/go/issues/25128#issuecomment-410764489.

		// Deferred calls run in reverse order, so on exit the namespace is
		// restored (if it was changed) and its handle closed before the
		// WaitGroup is released and the thread unlocked.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()
		defer g.wg.Done()

		// Get the current namespace of the thread the goroutine is locked to.
		threadNS, err := getNS()
		if err != nil {
			errC <- err
			return
		}
		defer threadNS.Close()

		// Attempt to set the network namespace of the current thread to either:
		// - the namespace referred to by the provided file descriptor from config
		// - the calling thread's namespace
		//
		// See the rules specified in the Config.NetNS documentation.
		explicitNS := true
		if netNS == 0 {
			explicitNS = false
			netNS = int(callerNS.FD())
		}

		// Only return an error if the network namespace was explicitly
		// configured; implicit configuration by zero value should be ignored.
		err = threadNS.Set(netNS)
		switch {
		case err != nil && explicitNS:
			errC <- err
			return
		case err == nil:
			// If the thread's namespace has been successfully manipulated,
			// make sure we change it back when the goroutine returns.
			defer threadNS.Restore()
		default:
			// We couldn't successfully set the namespace, but the caller didn't
			// explicitly ask for it to be set either. Continue.
		}

		// Signal to caller that initialization was successful.
		errC <- nil

		// Serve functions submitted through run, one at a time, until stop
		// closes doneC.
		for {
			select {
			case <-g.doneC:
				return
			case f := <-g.funcC:
				f()
			}
		}
	}()

	// Wait for the goroutine to return err or nil.
	if err := <-errC; err != nil {
		return nil, err
	}

	return g, nil
}

// stop signals the goroutine to stop and blocks until it does.
//
// It is invalid to call run concurrently with stop. It is also invalid to
// call run after stop has returned.
//
// stop must be called at most once: it closes doneC, and closing a channel
// that is already closed panics.
func (g *lockedNetNSGoroutine) stop() {
	// Closing doneC is the stop signal the worker's loop selects on.
	close(g.doneC)
	// Block until the worker has released the WaitGroup on its way out.
	g.wg.Wait()
}

// run hands f to the worker goroutine and blocks until f has finished
// executing there.
func (g *lockedNetNSGoroutine) run(f func()) {
	finished := make(chan struct{})

	// Wrap f so that completion is signaled by closing finished, even if f
	// does not return normally.
	wrapped := func() {
		defer close(finished)
		f()
	}

	g.funcC <- wrapped
	<-finished
}