#!/usr/bin/python3 -u
# -*- mode: Python; coding: utf-8 -*-

# Fix issues with missing OSTree objects
#
# Copyright (C) 2017  Endless Mobile, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""Fix issues with OSTree missing objects

When OSTree repository objects have been inadvertently deleted, it can
cause two types of problems (among others):

1. If the deleted object is part of a commit, then the commit is now
partial, but OSTree doesn't know that unless a commitpartial file
exists. Without that, it will assume the commit is fully intact and use
it as the source for a static delta.

2. If the deleted object is a commit, then any references to it will be
dangling. This will cause errors since OSTree assumes that a referenced
commit exists and raises an error as soon as something tries to use it.

This script attempts to address these 2 issues by repulling the commits
for any dangling references and marking any commits with missing objects
as partial.

To guard against another program operating on the repository, all
processes that have the repository open are killed.

Some commands for testing this script when hacking on it (from a throwaway
working system):

```
for objtype in commit dirtree dirmeta file; do
  sudo find /ostree/repo/objects -type f -name "*.${objtype}" -print -delete -quit
done
sudo eos-fix-ostree-repo
sudo ostree fsck
```

Make sure to test on normal and split-disk systems.
"""

from argparse import ArgumentParser
from fnmatch import fnmatch
import gi
gi.require_version('OSTree', '1.0')
from gi.repository import GLib, Gio, OSTree
import os
import pwd
import signal
import stat
import sys
import time


# Older OSTree versions had the GI annotation wrong, and the enum
# value was OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL while
# recent versions have OSTree.RepoCommitState.PARTIAL [1].
#
# So we check whether OSTree.RepoCommitState exposes PARTIAL and, if it
# does not, add OSTree.RepoCommitState.PARTIAL as an alias for the old
# enum value. The rest of this script can then use the new name
# unconditionally.
#
# [1] See https://github.com/ostreedev/ostree/pull/1335
if 'PARTIAL' not in dir(OSTree.RepoCommitState):
    OSTree.RepoCommitState.PARTIAL = OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL


# Prior to ostree-2017.1, the GI annotation for ostree_repo_list_objects
# was wrong, making it unusable from bindings[1]. This is tricky to
# detect since the version checking was not added until ostree-2017.3.
# However, in the same commit,
# ostree_repo_list_commit_objects_starting_with was fixed so that the
# out_commits parameter was marked properly.
#
# Use the direction of this argument as a proxy to know when
# ostree_repo_list_objects will work. Otherwise, make our own
# list_objects implementation and monkey-patch it into the Repo class.
#
# 1. https://github.com/ostreedev/ostree/commit/300752e5
_func = OSTree.Repo.list_commit_objects_starting_with
if _func.get_arguments()[1].get_direction() == gi._gi.Direction.IN:
    import glob

    # Stand-in for ostree_repo_list_objects. The real implementation
    # lives in
    # https://github.com/ostreedev/ostree/blob/master/src/libostree/ostree-repo.c
    def _list_objects(self, flags, cancellable=None):
        """List the loose objects in the repo by scanning its objects dir

        The flags and cancellable arguments are accepted only so the
        signature lines up with the real method; they are not used.

        Returns a (True, objects) tuple where objects maps each
        serialized object name to a (bas) variant.
        """
        repo_path = self.get_path().get_path()
        archive_mode = self.get_mode() == OSTree.RepoMode.ARCHIVE_Z2

        # File extension -> object type. Content objects are stored as
        # .filez in archive-z2 repos and as .file in every other mode;
        # the extension belonging to the other mode is ignored.
        known_types = {
            '.dirtree': OSTree.ObjectType.DIR_TREE,
            '.dirmeta': OSTree.ObjectType.DIR_META,
            '.commit': OSTree.ObjectType.COMMIT,
        }
        content_ext = '.filez' if archive_mode else '.file'
        known_types[content_ext] = OSTree.ObjectType.FILE

        # Each object is stored below a directory named after the first
        # 2 characters of its sha256sum, with the remaining 62
        # characters as the file name. E.g.,
        # 8d/2925839245dd91ac9fbfcc0e7a383cddf5145bd7c5bc5de0d46929a3fa5963.file.
        found = {}
        pattern = '{}/objects/[a-f0-9][a-f0-9]'.format(repo_path)
        for bucket in glob.iglob(pattern):
            prefix = os.path.basename(bucket)
            for filename in os.listdir(bucket):
                stem, suffix = os.path.splitext(filename)
                kind = known_types.get(suffix)

                # Skip unknown extensions and names that are not the
                # tail of a sha256sum
                if kind is None or len(stem) != 62:
                    continue

                # Keyed by the serialized object name. Every entry gets
                # an equal (bas) variant as its value (I think packed
                # objects were supposed to put something here, but only
                # loose objects ever exist).
                serialized = OSTree.object_name_serialize(prefix + stem,
                                                          kind)
                found[serialized] = GLib.Variant.new_tuple(
                    GLib.Variant('b', True),
                    GLib.Variant('as', [])
                )

        return True, found

    # Override the standard list_objects
    OSTree.Repo.list_objects = _list_objects


def kill_repo_procs(repo_path, sig):
    """Kill all processes with repo open

    Walk /proc to find any process, other than this one, that has the
    repo directory or a path below it open and send it signal sig.

    repo_path is compared as a string against the targets of the
    /proc/<pid>/fd symlinks. Processes that disappear while they are
    being inspected, or before the signal can be delivered, are skipped.
    """
    print('Killing processes with', repo_path, 'open with signal', sig)

    self_pid = os.getpid()
    repo_prefix = repo_path + '/'
    for pid in os.listdir('/proc'):
        if not pid.isnumeric():
            continue
        if int(pid) == self_pid:
            continue

        proc_dir = os.path.join('/proc', pid)
        fd_dir = os.path.join(proc_dir, 'fd')

        # The process may have exited
        try:
            proc_fds = os.listdir(fd_dir)
        except FileNotFoundError:
            continue

        for fd in proc_fds:
            # The process may have exited or the file may have been closed
            try:
                fd_path = os.readlink(os.path.join(fd_dir, fd))
            except FileNotFoundError:
                continue

            # Only the repo itself or a path within the repo is of
            # interest
            if fd_path != repo_path and not fd_path.startswith(repo_prefix):
                continue

            # Try to read the exe file for information, but in some
            # cases (kernel thread), it may not exist. The name is only
            # used for the message below, so any failure to read it is
            # fine.
            try:
                pid_exe = os.readlink(os.path.join(proc_dir, 'exe'))
            except OSError:
                pid_exe = ''

            # Kill it and go to the next process. The process can exit
            # between finding the open file and sending the signal
            # (likely on the SIGKILL pass after a SIGTERM), which is
            # the desired outcome and not an error.
            print('Killing pid', pid, pid_exe, 'with signal', sig)
            try:
                os.kill(int(pid), sig)
            except ProcessLookupError:
                print('pid', pid, 'exited before it could be signalled')
            break


def pull_commit(repo, remote, checksum, full=False):
    """Pull commit from remote

    Pull the single commit checksum (depth 0) from remote into repo,
    reporting progress with OSTree's default console progress handler.

    When full is False, only the commit metadata will be pulled.

    Errors raised by the pull propagate to the caller; the progress
    object is finished either way.
    """
    if full:
        flags = OSTree.RepoPullFlags.NONE
    else:
        flags = OSTree.RepoPullFlags.COMMIT_ONLY
    opts = GLib.Variant('a{sv}', {
        'flags': GLib.Variant('i', flags),
        'refs': GLib.Variant('as', (checksum,)),
        'depth': GLib.Variant('i', 0),
    })

    # FIXME: For some reason, pull_with_options cannot be stopped with
    # ^C from the keyboard (SIGINT). This could be a problem in ostree
    # or pygobject, but I suspect it has something to do with what pull
    # does with the main context.
    progress = OSTree.AsyncProgress.new()
    progress.connect('changed',
                     OSTree.Repo.pull_default_console_progress_changed,
                     None)
    try:
        repo.pull_with_options(remote, opts, progress)
    finally:
        # Finish the progress object even when the pull fails so it is
        # not left half-reported.
        progress.finish()


def fix_dangling_refs(repo):
    """Restore the commit object for every ref whose commit is gone

    Each ref in repo is checked by loading its commit. When the load
    fails because the commit was not found, only the commit metadata is
    pulled again, which is enough to make the ref valid. Any other load
    error is re-raised.
    """
    print('Fixing refs pointing to missing commits in',
          os.path.realpath(repo.get_path().get_path()))

    _, refs = repo.list_refs()
    for refspec, commit_id in refs.items():
        try:
            repo.load_commit(commit_id)
        except GLib.Error as load_err:
            commit_missing = load_err.matches(Gio.io_error_quark(),
                                              Gio.IOErrorEnum.NOT_FOUND)
            if commit_missing:
                # The ref is dangling, so fetch the commit metadata
                # once more.
                _, remote_name, ref_name = OSTree.parse_refspec(refspec)
                if remote_name is None:
                    # A ref without a remote is assumed to be an ostree
                    # ref, which comes from the "eos" remote.
                    print('No remote for ref', ref_name, 'assuming "eos"')
                    remote_name = 'eos'
                print('Pulling', commit_id, 'commit metadata from',
                      remote_name, 'for', ref_name)
                pull_commit(repo, remote_name, commit_id)
            else:
                raise


def mark_commits_partial(repo):
    """Mark commits with missing objects as partial

    Every commit object in repo that is not already marked partial is
    traversed. If the traversal fails because an object was not found,
    or if any object reachable from the commit is absent from the repo's
    object listing, an empty state/<checksum>.commitpartial file is
    created for the commit. Any other traversal error is re-raised.
    """
    repo_path = os.path.realpath(repo.get_path().get_path())
    print('Marking commits with missing objects as partial in', repo_path)

    _, all_objects = repo.list_objects(OSTree.RepoListObjectsFlags.ALL, None)
    for objname in all_objects:
        checksum, objtype = OSTree.object_name_deserialize(objname)
        if objtype != OSTree.ObjectType.COMMIT:
            continue

        # The commit state is a set of flags and OSTree may report
        # other bits together with PARTIAL, so test for the bit instead
        # of comparing for equality. Otherwise such a commit would be
        # processed again and its existing commitpartial file truncated
        # below.
        _, _, state = repo.load_commit(checksum)
        if state & OSTree.RepoCommitState.PARTIAL:
            print('Commit', checksum, 'already marked as partial')
            continue

        try:
            # If a dirtree is missing, traverse_commit will fail with
            # G_IO_ERROR_NOT_FOUND.
            _, reachable_objects = repo.traverse_commit(checksum, 0)

            # Unfortunately, it doesn't check that the leaves (dirmeta
            # and files) exist, so we need to do that manually. In case
            # that behavior ever changes, just check that all the
            # reachable objects exist.
            #
            # https://github.com/ostreedev/ostree/issues/1222
            mark_partial = any(commit_obj not in all_objects
                               for commit_obj in reachable_objects)
        except GLib.Error as err:
            if not err.matches(Gio.io_error_quark(),
                               Gio.IOErrorEnum.NOT_FOUND):
                raise
            mark_partial = True

        if mark_partial:
            print('Marking commit', checksum, 'as partial')
            commit_partial_path = os.path.join(repo_path, 'state',
                                               checksum + '.commitpartial')
            # Create the (empty) marker file
            with open(commit_partial_path, 'w'):
                pass


def pull_partial_refs(repo):
    """Try to fully restore any partial referenced commits

    Every ref in repo that belongs to a remote, is not a Locale ref and
    points to a commit marked partial has its commit pulled in full from
    that remote. Errors from loading a commit or from the pull propagate
    to the caller.
    """
    # Look for any partial refs and re-pull them.
    _, all_refs = repo.list_refs()
    for refspec, checksum in all_refs.items():
        _, remote, ref = OSTree.parse_refspec(refspec)
        if remote is None:
            # Don't bother pulling local refs. Only the ostree deploys
            # are local, and as long as they're marked partial, they can
            # be updated later.
            continue

        # If this is an app or runtime locale, it's intentionally
        # partial since only the relevant subpaths are pulled. Skip it
        # to not use up extra bandwidth and disk space.
        if fnmatch(ref, '*/*.Locale/*/*'):
            print('Skipping intentionally partial Locale commit',
                  refspec, checksum)
            continue

        # The commit state is a set of flags and OSTree may report
        # other bits together with PARTIAL, so test for the bit instead
        # of comparing for equality. Otherwise a partial commit carrying
        # an extra flag would never be pulled again.
        _, _, state = repo.load_commit(checksum)
        if not (state & OSTree.RepoCommitState.PARTIAL):
            continue

        # Try to pull the full commit again.
        print('Pulling', checksum, 'commit from', remote, 'for', ref)
        pull_commit(repo, remote, checksum, full=True)


def main():
    """Parse the command line and repair the selected OSTree repo

    The repo is either opened directly (--repo) or taken from a sysroot
    (--sysroot, or the default sysroot when neither option is given).
    Must be run as root on a root owned repo. All other processes with
    the repo open are killed and the sysroot, if any, is locked before
    the repo is repaired.

    Exits with status 1 when a precondition is not met.
    """
    aparser = ArgumentParser(
        description='Fix broken OSTree repo'
    )
    path_group = aparser.add_mutually_exclusive_group()
    path_group.add_argument('--sysroot', help='path to OSTree sysroot')
    path_group.add_argument('--repo', help='path to OSTree repo')
    args = aparser.parse_args()

    if os.geteuid() != 0:
        print('Must be root to run', sys.argv[0], file=sys.stderr)
        sys.exit(1)

    print('WARNING: Do not start App Center while this is running')

    if args.repo is not None:
        # Use a repo directly instead of getting it from the sysroot
        sysroot = None
        repo_file = Gio.File.new_for_path(args.repo)
        repo = OSTree.Repo.new(repo_file)
        repo.open()
    else:
        # Get the repo from the sysroot
        if args.sysroot is None:
            sysroot_file = None
        else:
            sysroot_file = Gio.File.new_for_path(args.sysroot)
        sysroot = OSTree.Sysroot.new(sysroot_file)
        sysroot.load()
        _, repo = sysroot.get_repo()

    # Resolve the full repo path
    repo_path = os.path.realpath(repo.get_path().get_path())

    # Must be running as the owner of the repo. We don't want to make
    # root owned files in a non-root owned repo.
    repo_uid = os.stat(repo_path).st_uid
    if os.geteuid() != repo_uid:
        # Try to get the repo owner's name. getpwuid raises KeyError
        # when there is no passwd entry for the uid, in which case the
        # numeric uid is shown instead.
        try:
            repo_user = pwd.getpwuid(repo_uid).pw_name
        except KeyError:
            repo_user = repo_uid

        print(repo_path, 'is owned by', repo_user, 'not root',
              file=sys.stderr)
        sys.exit(1)

    # Kill once with SIGTERM, then with SIGKILL
    kill_repo_procs(repo_path, signal.SIGTERM)
    time.sleep(1)
    kill_repo_procs(repo_path, signal.SIGKILL)

    # Now lock the sysroot if one is in use
    if sysroot is not None:
        # ostree_sysroot_try_lock reports whether the lock was taken
        # through its out_acquired out-parameter. The binding hands that
        # back together with the C return value as a tuple, and a
        # non-empty tuple is always truthy, so look at the acquired
        # flag itself. A bare boolean result is accepted too.
        lock_result = sysroot.try_lock()
        if isinstance(lock_result, tuple):
            acquired = lock_result[-1]
        else:
            acquired = lock_result
        if not acquired:
            print('Could not lock sysroot', sysroot.get_path().get_path(),
                  file=sys.stderr)
            sys.exit(1)

    # In older OSTree, cleaning up after a transaction (e.g., a pull)
    # would delete the tmp/cache directory if it was older than 1 day.
    # That's a problem because it has an open fd for that directory.
    # Update the directory's mtime to current. This is racy because
    # other repo users may have deleted the directory after we opened
    # the repo and before they were killed, so just fail if the
    # directory doesn't exist.
    cache_dir = os.path.join(repo_path, 'tmp', 'cache')
    try:
        os.utime(cache_dir)
    except FileNotFoundError:
        print(cache_dir, 'does not exist - run', sys.argv[0], 'again!',
              file=sys.stderr)
        sys.exit(1)

    # First, fix dangling refs so that refs can be reliably listed again
    fix_dangling_refs(repo)

    # Next, traverse all commits to mark any as partial
    mark_commits_partial(repo)

    # Finally, try to completely pull in any partial referenced commits
    # so there are no longer any missing objects
    pull_partial_refs(repo)

    print('\nSuccess! Try to update the OS and Apps now.')


# Run the repair only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
