#!/usr/bin/perl
# This program is open source, licensed under the PostgreSQL License.
# For license terms, see the LICENSE file.
#
# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault

=head1 NAME

ocf_heartbeat_pgsqlms - A PostgreSQL multi-state resource agent for Pacemaker

=head1 SYNOPSIS

B<pgsqlms> [start | stop | monitor | promote | demote | notify | reload | methods | meta-data | validate-all]

=head1 DESCRIPTION

Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource.

=cut

use strict;
use warnings;
use 5.008;

use POSIX qw(locale_h);
use Scalar::Util qw(looks_like_number);
use File::Spec;
use File::Temp;
use Data::Dumper;

use FindBin;
use lib "$FindBin::RealBin/../lib/";
use lib "$FindBin::RealBin/../../lib/heartbeat/";

use OCF_ReturnCodes;
use OCF_Directories;
use OCF_Functions;

our $VERSION = 'v2.3.0';
our $PROGRAM = 'pgsqlms';

# OCF environment
# Resource instance name, read from the OCF_RESOURCE_INSTANCE environment
# variable.
my $OCF_RESOURCE_INSTANCE = $ENV{'OCF_RESOURCE_INSTANCE'};
# Alias of OCF_SUCCESS.
my $OCF_RUNNING_SLAVE     = $OCF_SUCCESS;
# Notification data, only populated when the current action is "notify".
# NOTE: the declaration is kept separate from the conditional assignment
# because the behavior of "my ... if COND" is undefined in Perl (see perlsyn).
my %OCF_NOTIFY_ENV;
%OCF_NOTIFY_ENV = ocf_notify_env() if $__OCF_ACTION eq 'notify';

# Default parameters values
# These are the fallbacks used below when the matching OCF_RESKEY_* environment
# variable is not set (or holds a false value).
my $system_user_default = "postgres";
my $bindir_default      = "/usr/bin";
my $pgdata_default      = "/var/lib/pgsql/data";
my $pghost_default      = "/tmp";
my $pgport_default      = 5432;
my $start_opts_default  = "";
my $maxlag_default      = "0";

# Set default values if not found in environment
# NOTE: as "||" is used, an empty string or "0" given in the environment also
# falls back to the default value.
my $system_user  = $ENV{'OCF_RESKEY_system_user'} || $system_user_default;
my $bindir       = $ENV{'OCF_RESKEY_bindir'} || $bindir_default;
my $pgdata       = $ENV{'OCF_RESKEY_pgdata'} || $pgdata_default;
# "datadir" falls back to the value of "pgdata" when not given
my $datadir      = $ENV{'OCF_RESKEY_datadir'} || $pgdata;
my $pghost       = $ENV{'OCF_RESKEY_pghost'} || $pghost_default;
my $pgport       = $ENV{'OCF_RESKEY_pgport'} || $pgport_default;
my $start_opts   = $ENV{'OCF_RESKEY_start_opts'} || $start_opts_default;
my $maxlag       = $ENV{'OCF_RESKEY_maxlag'} || $maxlag_default;
# Template used to build recovery.conf, looked up in "pgdata" by default
my $recovery_tpl = $ENV{'OCF_RESKEY_recovery_template'}
    || "$pgdata/recovery.conf.pcmk";


# PostgreSQL commands path
# All the binaries are looked up in "bindir".
my $POSTGRES   = "$bindir/postgres";
my $PGCTL      = "$bindir/pg_ctl";
my $PGPSQL     = "$bindir/psql";
my $PGCTRLDATA = "$bindir/pg_controldata";
my $PGISREADY  = "$bindir/pg_isready";
# NOTE: replaced with "pg_xlogdump" by _check_switchover() before v10
my $PGWALDUMP  = "$bindir/pg_waldump";

# pacemaker commands path
# NOTE: $CRM_MASTER and $ATTRD_PRIV embed their common options, so they are
# command line prefixes rather than plain paths.
my $CRM_MASTER    = "$HA_SBIN_DIR/crm_master --lifetime forever";
my $CRM_NODE      = "$HA_SBIN_DIR/crm_node";
my $CRM_RESOURCE  = "$HA_SBIN_DIR/crm_resource";
my $ATTRD_PRIV    = "$HA_SBIN_DIR/attrd_updater --private --lifetime reboot";

# Global vars
my $nodename;
my $exit_code = 0;
# numeric pgsql versions
# Same encoding as the one computed by _get_pg_version(): major * 10000, plus
# second number * 100 before v10 (eg. 9.3 is 90300, 10 is 100000).
my $PGVERNUM;
my $PGVER_93 = 90300;
my $PGVER_10 = 100000;
my $PGVER_12 = 120000;

# Run a query using psql.
#
# Arguments:
#   $query: the SQL text to run, must be defined and not empty
#   $res:   a reference to the array receiving the result set. Each row is
#           pushed as a reference to an array of fields.
#
# The query is written to a temporary file created in $HA_RSCTMP, then psql is
# executed from a forked child with the effective uid of "system_user".
#
# Returns the psql return code:
#    -1: wrong parameters
#     0: OK
#     1: failed to get resources (memory, missing file, fork failure, ...)
#     2: unable to connect
#     3: query failed
#   127: psql could not be executed
#
sub _query {
    my $query        = shift;
    my $res          = shift;
    my $connstr      = "dbname=postgres";
    my $RS           = chr(30); # ASCII RS  (record separator)
    my $FS           = chr(3);  # ASCII ETX (end of text)
    my $postgres_uid = getpwnam( $system_user );
    my $oldeuid      = $>;
    my $tmpfile;
    my $ans;
    my $pid;
    my $rc;

    unless ( defined $res and defined $query and $query ne '' ) {
        ocf_log( 'debug', '_query: wrong parameters!' );
        return -1;
    }

    unless ( $tmpfile = File::Temp->new(
            TEMPLATE => 'pgsqlms-XXXXXXXX',
            DIR      => $HA_RSCTMP
        ) )
    {
        ocf_exit_reason( 'Could not create or write in a temp file' );
        exit $OCF_ERR_INSTALLED;
    }

    print $tmpfile $query;
    # the file must be readable by "system_user"
    chmod 0644, $tmpfile;

    ocf_log( 'debug', '_query: %s', $query );

    # Change the effective user to the given system_user so after forking
    # the given uid to the process should allow psql to connect w/o password
    $> = $postgres_uid;

    # Forking + piping
    $pid = open(my $KID, "-|");

    unless ( defined $pid ) {
        # The fork failed. Without this check, "undef == 0" would be true
        # and the parent itself would exec psql.
        my $errmsg = "$!";

        $> = $oldeuid;
        ocf_log( 'err', '_query: could not fork to run psql: %s', $errmsg );

        return 1;
    }

    if ( $pid == 0 ) { # child
        # If exec fails, leave the child immediately. Otherwise, it would keep
        # running the code of the parent. _exit is used to skip END blocks and
        # destructors inherited from the parent.
        exec $PGPSQL, '--set', 'ON_ERROR_STOP=1', '-qXAtf', $tmpfile,
            '-R', $RS, '-F', $FS, '--port', $pgport, '--host', $pghost,
            $connstr
            or POSIX::_exit( 127 );
    }

    # parent
    $> = $oldeuid;

    {
        # slurp the whole output of psql
        local $/;
        $ans = <$KID>;
    }

    close $KID;
    $rc = $? >> 8;

    ocf_log( 'debug', '_query: psql return code: %d', $rc );

    if ( defined $ans ) {
        # remove the last character of the output (presumably the trailing
        # newline written by psql)
        chop $ans;

        # split the output in records, then each record in fields, using the
        # same separators given to psql
        push @{ $res }, [ split( /\Q$FS\E/, $_, -1 ) ]
            foreach split( /\Q$RS\E/, $ans, -1 );

        ocf_log( 'debug', '_query: @res: %s',
            Data::Dumper->new( [ $res ] )->Terse(1)->Dump );
    }

    return $rc;
}

# Fetch the last WAL location received by the local standby.
#
# If the first argument is true, the location is returned as a decimal value
# (its difference with '0/0'). Otherwise, it is returned as an LSN.
# The pre-v10 function names are used when $PGVERNUM is lower than $PGVER_10.
#
# Returns undef if the query failed or if it did not return a true value.
sub _get_last_received_lsn {
    my ( $dec ) = @_;
    my $is_pre_v10 = $PGVERNUM < $PGVER_10;
    my $receive_fn = $is_pre_v10 ? 'pg_last_xlog_receive_location()'
                                 : 'pg_last_wal_receive_lsn()';
    my $diff_fn    = $is_pre_v10 ? 'pg_xlog_location_diff'
                                 : 'pg_wal_lsn_diff';
    my @rows;

    my $sql = $dec
        ? "SELECT $diff_fn( $receive_fn, '0/0' )"
        : "SELECT $receive_fn";

    my $rc = _query( $sql, \@rows );

    if ( $rc != 0 ) {
        ocf_log( 'err', 'Could not query last received LSN (%s)', $rc );
        return undef;
    }

    return $rows[0][0] if $rows[0][0];

    ocf_log( 'err', 'No values for last received LSN' );

    return undef;
}

# Compute a promotion priority for each standby found in pg_stat_replication.
#
# Returns a reference to the result set of the query. Each row holds:
# application_name, priority, location, state and current_lag.
# Exits with OCF_ERR_GENERIC if the query failed.
sub _get_lag_scores {
    # names of the functions and column depend on the PostgreSQL version
    my ( $current_lsn_fn, $lsn_diff_fn, $write_col ) = $PGVERNUM < $PGVER_10
        ? ( 'pg_current_xlog_location()', 'pg_xlog_location_diff', 'write_location' )
        : ( 'pg_current_wal_lsn()',       'pg_wal_lsn_diff',       'write_lsn'      );
    my @rows;

    # The locations of the connected standbies come from the
    # "pg_stat_replication" view.
    # row_number() is computed on the rows ordered by location ASC, then by
    # application_name ASC, and the priority is 1000 - (row_number - 1) * 10.
    # The priority is made negative when maxlag is greater than 0 and the lag
    # of the standby exceeds it.
    # The result set is ordered by priority DESC.
    my $sql = qq{
      SELECT application_name, priority, location, state, current_lag
      FROM (
        SELECT application_name,
          (1000 - (
            row_number() OVER (
              PARTITION BY state IN ('startup', 'backup')
              ORDER BY location ASC, application_name ASC
            ) - 1
           ) * 10
          ) * CASE WHEN ( $maxlag > 0
                     AND current_lag > $maxlag)
                        THEN -1
                   ELSE 1
              END AS priority,
          location, state, current_lag
        FROM (
          SELECT application_name, $write_col AS location, state,
            $lsn_diff_fn($current_lsn_fn, $write_col) AS current_lag
          FROM pg_stat_replication
        ) AS s2
      ) AS s1
      ORDER BY priority DESC
    };

    my $rc = _query( $sql, \@rows );

    return \@rows if $rc == 0;

    ocf_exit_reason( 'Query to get standby locations failed (%d)', $rc );
    exit $OCF_ERR_GENERIC;
}

# Get the timeout of the current action, in seconds.
#
# The timeout is read in milliseconds from the OCF_RESKEY_CRM_meta_timeout
# environment variable. It is converted to whole seconds, truncating any
# fractional part.
#
# Returns   timeout as integer
#           undef if unknown (variable not set or not a positive integer)
sub _get_action_timeout {
    my $timeout_ms = $ENV{'OCF_RESKEY_CRM_meta_timeout'};
    my $timeout;

    # Validate the raw value before using it in a division: dividing an
    # undefined value silently gives 0, and a non-multiple of 1000 must not
    # be discarded.
    $timeout = int( $timeout_ms / 1000 )
        if defined $timeout_ms and $timeout_ms =~ /^\d+$/;

    ocf_log( 'debug', '_get_action_timeout: known timeout: %s',
        defined $timeout ? $timeout : 'undef' );

    return $timeout;
}

# Get, parse and return the value of the given private attribute name.
#
# Arguments:
#   $name: attribute name, without the resource instance name suffix
#   $node: optional node name to query the attribute from
#
# Returns an empty string if the attribute is not found or if the output of
# attrd_updater could not be parsed.
sub _get_priv_attr {
    my ( $name, $node ) = @_;
    my $val             = '';
    my $node_arg        = '';
    my $ans;

    $node = '' unless defined $node;
    $name = "$name-$OCF_RESOURCE_INSTANCE";

    $node_arg= "--node $node" if $node ne '';

    $ans = qx{ $ATTRD_PRIV --name "$name" --query $node_arg };

    # Only read $1 after a successful match: on failure, capture variables
    # keep the value of a previous successful match, which would be returned
    # as the attribute value.
    if ( defined $ans
        and $ans =~ m/^name=".*" host=".*" value="(.*)"$/
    ) {
        $val = $1;
    }

    ocf_log( 'debug', '_get_priv_attr: value of "%s"%s is "%s"', $name,
        ( $node ? " on \"$node\"": ""),
        $val );

    return $val;
}

# Set the given private attribute to the given value.
#
# The update is asynchronous, so this sub polls the attribute every 0.1s and
# only returns once the value read back equals the requested one.
sub _set_priv_attr {
    my ( $name, $val ) = @_;
    my $full_name      = "$name-$OCF_RESOURCE_INSTANCE";

    ocf_log( 'debug', '_set_priv_attr: set "%s=%s"...', $full_name, $val );

    qx{ $ATTRD_PRIV --name "$full_name" --update "$val" };

    # _get_priv_attr appends the resource instance name itself, so it must
    # be given the bare attribute name.
    until ( _get_priv_attr( $name ) eq $val ) {
        ocf_log( 'debug', '_set_priv_attr: waiting attrd ack for "%s"...', $full_name );
        select( undef, undef, undef, 0.1 );
    }

    return;
}

# Delete the given private attribute.
#
# The deletion is asynchronous, so this sub polls the attribute every 0.1s and
# only returns once it reads back as an empty string.
sub _delete_priv_attr {
    my ( $name )  = @_;
    my $full_name = "$name-$OCF_RESOURCE_INSTANCE";

    ocf_log( 'debug', '_delete_priv_attr: delete "%s"...', $full_name );

    qx{ $ATTRD_PRIV --name "$full_name" --delete };

    # _get_priv_attr appends the resource instance name itself, so it must
    # be given the bare attribute name.
    until ( _get_priv_attr( $name ) eq '' ) {
        ocf_log( 'debug', '_delete_priv_attr: waiting attrd ack for "%s"...',
            $full_name );
        select( undef, undef, undef, 0.1 );
    }

    return;
}

# Get the master score of the resource on the given node, or on the node
# picked by crm_master when no node is given.
#
# Returns the score with its trailing newline removed.
# Returns an empty string if crm_master failed or returned nothing.
sub _get_master_score {
    my ( $node ) = @_;
    my $node_arg = ( defined $node and $node ne '' )
        ? sprintf( '--node "%s"', $node )
        : '';

    my $score = qx{ $CRM_MASTER --quiet --get-value $node_arg 2> /dev/null };

    if ( $? == 0 and defined $score ) {
        chomp $score;
        return $score;
    }

    return '';
}

# Set the master score of the optionally given node, or of the node picked by
# crm_master when no node is given.
#
# The update is asynchronous, so this sub polls the score every 0.1s and only
# returns once the value read back equals the requested one.
sub _set_master_score {
    my ( $score, $node ) = @_;
    my $node_arg = ( defined $node and $node ne '' )
        ? sprintf( '--node "%s"', $node )
        : '';

    qx{ $CRM_MASTER $node_arg --quiet --update "$score" };

    while ( 1 ) {
        my $current = _get_master_score( $node );

        last if $current eq $score;

        ocf_log( 'debug',
            '_set_master_score: waiting to set score to "%s" (currently "%s")...',
            $score, $current );
        select(undef, undef, undef, 0.1);
    }

    return;
}

# _master_score_exists
# Look for a master score greater than -1 on the nodes listed by
# "crm_node --partition".
# Returns 1 as soon as such a score is found.
# Returns 0 otherwise
sub _master_score_exists {
    my $partition = qx{ $CRM_NODE --partition };

    for my $member ( split /\s+/, $partition ) {
        my $score = _get_master_score( $member );

        # no score known for this node
        next unless defined $score and $score ne '';

        return 1 if $score > -1;
    }

    return 0;
}

# Check if the current transition is a recover of a master clone on the given
# node: the node must appear in both the "master" and "promote" lists of the
# notify environment.
# Returns a true value if so, a false one otherwise.
sub _is_master_recover {
    my ( $n ) = @_;

    my $as_master = grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'master'} };

    # not a master: no need to look at the promote list
    return $as_master unless $as_master;

    return scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'promote'} };
}

# Check if the current transition is a recover of a slave clone on the given
# node: the node must appear in both the "slave" and "start" lists of the
# notify environment.
# Returns a true value if so, a false one otherwise.
sub _is_slave_recover {
    my ( $n ) = @_;

    my $as_slave = grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} };

    # not a slave: no need to look at the start list
    return $as_slave unless $as_slave;

    return scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'start'} };
}

# Check if the current transition is a switchover to the given node.
#
# A switchover is detected when the notify environment holds exactly one
# master, one demote and one promote, and:
#   * the current master is the one being demoted
#   * the given node is a slave being promoted
#   * the current master is not being stopped
#
# Returns a true value if so, a false one otherwise.
sub _is_switchover {
    my ( $n ) = @_;
    my @masters  = @{ $OCF_NOTIFY_ENV{'master'}  || [] };
    my @demotes  = @{ $OCF_NOTIFY_ENV{'demote'}  || [] };
    my @promotes = @{ $OCF_NOTIFY_ENV{'promote'} || [] };
    my $old;

    return 0 if scalar @masters  != 1
             or scalar @demotes  != 1
             or scalar @promotes != 1;

    # Only read the name of the old master once we know there is exactly one.
    # Reading $OCF_NOTIFY_ENV{'master'}[0]{'uname'} beforehand would
    # autovivify a fake master entry when the list is empty, corrupting both
    # the size check above and the notify data.
    $old = $masters[0]{'uname'};

    return (
           scalar grep { $_->{'uname'} eq $old } @demotes
       and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} || [] }
       and scalar grep { $_->{'uname'} eq $n } @promotes
       and not scalar grep { $_->{'uname'} eq $old } @{ $OCF_NOTIFY_ENV{'stop'} || [] }
    );
}

# Run the given command as the "system_user" given as parameter.
# It basically forks and seteuid/setuid away from root.
#
# Arguments: the command and its arguments, as a list.
#
# Returns the exit code of the command, or 127 if it could not be executed.
# Exits with OCF_ERR_GENERIC if the fork failed.
#
sub _runas {
    my $rc;
    my $pid;
    my @cmd = @_;
    my (undef, undef, $postgres_uid, $postgres_gid ) = getpwnam( $system_user );

    $pid = fork;

    unless ( defined $pid ) {
        # The fork failed. Without this check, "undef == 0" would be true
        # and the agent itself would drop its privileges and exec the command.
        ocf_exit_reason( 'Could not fork to run command "%s": %s',
            join(' ', @cmd), $! );
        exit $OCF_ERR_GENERIC;
    }

    if ( $pid == 0 ) { # in child
        # set the effective gid and the supplementary groups of system_user
        $) = "$postgres_gid $postgres_gid";
        while ( my ( undef, undef, $gid, $members ) = getgrent ) {
            $) .= " $gid" if grep { $system_user eq $_ } split /\s+/, $members
        }
        # set the real gid
        $( = $postgres_gid;

        # set the real and effective uid
        $< = $> = $postgres_uid;

        # If exec fails, leave the child immediately. Otherwise, it would keep
        # running the code of the agent. _exit is used to skip END blocks and
        # destructors inherited from the parent.
        exec @cmd
            or POSIX::_exit( 127 );
    }

    ocf_log( 'debug', '_runas: launching as "%s" command "%s"', $system_user,
        join(' ', @cmd) );

    waitpid $pid, 0;
    $rc = $? >> 8;

    return $rc;
}

# Run pg_isready as "system_user" against the configured host and port.
#
# Returns the exit code of pg_isready. Possible error codes:
#   1: ping rejected (usually when instance is in startup, in crash
#      recovery, in warm standby, or when a shutdown is in progress)
#   2: no response, usually means the instance is down
#   3: no attempt, probably a syntax error, should not happen
#
sub _pg_isready {
    # Use the action timeout, or 24h if unknown, and add 60s to it so
    # Pacemaker gives up before us and takes decisions
    my $timeout = _get_action_timeout() || 60*60*24;

    $timeout += 60;

    return _runas(
        $PGISREADY,
        '-h', $pghost,
        '-p', $pgport,
        '-d', 'postgres',
        '-t', $timeout
    );
}

# Check the postmaster.pid file and the postmaster process using
# "pg_ctl status", run as "system_user".
#
# WARNING: we do not distinguish the scenario where postmaster.pid does not
# exist from the scenario where the process it references is not alive. It
# should be ok though, as this is considered a hard error from monitor.
#
# Returns the exit code of pg_ctl. Possible error codes:
#   3: postmaster.pid file does not exist OR it does but the process
#      with the PID found in the file is not alive
#
sub _pg_ctl_status {
    my @status_cmd = ( $PGCTL, '--pgdata', $pgdata, 'status' );

    return _runas( @status_cmd );
}

# Start the local instance using "pg_ctl start", run as "system_user".
#
# The "start_opts" parameter is given to pg_ctl with -o when not empty.
#
# Returns the exit code of pg_ctl.
#
sub _pg_ctl_start {
    # Use the action timeout, or 24h if unknown, and add 60s to it so
    # Pacemaker gives up before us and takes decisions
    my $timeout = _get_action_timeout() || 60*60*24;

    $timeout += 60;

    my @start_opts_args = $start_opts ne '' ? ( '-o', $start_opts ) : ();

    return _runas(
        $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $timeout, 'start',
        @start_opts_args
    );
}

# Enable the Standby mode.
#
# Up to v11, creates the recovery.conf file based on the given template.
# Since v12, creates an empty standby.signal file.
#
# The file is created in "datadir" and its ownership is given to
# "system_user".
#
# Exits with OCF_ERR_CONFIGURED if the template cannot be read, or if the
# standby file cannot be written or chown'ed.
sub _enable_recovery {
    my $fh;
    my $content      = '';
    my $standby_file = "$datadir/standby.signal";
    my (undef, undef, $uid, $gid) = getpwnam($system_user);

    if ( $PGVERNUM < $PGVER_12 ) {
        $standby_file = "$datadir/recovery.conf";

        ocf_log( 'debug',
            '_enable_recovery: get replication configuration from the template file "%s"',
            $recovery_tpl );

        # Create the recovery.conf file to start the instance as a secondary.
        # NOTE: the recovery.conf is supposed to be set up so the secondary can
        # connect to the primary instance, eg. using a virtual IP address.
        # As there is no primary instance available at startup, secondaries will
        # complain about failing to connect.
        # As we can not reload a recovery.conf file on a standby without restarting
        # it, we will leave with this.
        # FIXME how would the reload help us in this case ?
        unless ( open( $fh, '<', $recovery_tpl ) ) {
            ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! );
            exit $OCF_ERR_CONFIGURED;
        }

        # Copy all parameters from the template file, making sure the last
        # line ends with a newline
        while (my $line = <$fh>) {
            chomp $line;
            $content .= "$line\n";
        }
        close $fh;
    }

    ocf_log( 'debug', '_enable_recovery: write the standby file "%s"', $standby_file );

    unless ( open( $fh, '>', $standby_file ) ) {
        ocf_exit_reason( 'Could not open file "%s": %s', $standby_file, $! );
        exit $OCF_ERR_CONFIGURED;
    }

    # Write the recovery.conf file using configuration from the template file.
    # Both print and close are checked: buffered write errors (eg. disk full)
    # might only surface on close, and a truncated recovery.conf must not go
    # unnoticed.
    unless ( print( {$fh} $content ) and close( $fh ) ) {
        ocf_exit_reason( 'Could not write file "%s": %s', $standby_file, $! );
        exit $OCF_ERR_CONFIGURED;
    }

    unless ( chown $uid, $gid, $standby_file ) {
        ocf_exit_reason( 'Could not set owner of "%s"', $standby_file );
        exit $OCF_ERR_CONFIGURED;
    };
}

# Parse and return various informations about the local PostgreSQL instance as
# reported by its controldata file.
#
# Returns a hash with the following keys:
#   state:     the database cluster state
#   redo:      the REDO location of the latest checkpoint
#   tl:        the timeline ID of the latest checkpoint
#   wal_level: the wal_level setting
#
# WARNING: the status is NOT updated in case of crash.
#
# This sub exit the script with OCF_ERR_ARGS on failure
sub _get_controldata {
    my %controldata;
    my $ans;

    $ans = qx{ $PGCTRLDATA "$datadir" 2>/dev/null };
    $ans = '' unless defined $ans;

    # Parse the output of pg_controldata.
    # This output is quite stable between pg versions, but we might need to sort
    # it at some point if things are moving in there...
    # NOTE: captures are only read after a successful match, as they keep the
    # values of a previous successful match otherwise.
    if ( $ans =~ m{
        # get the current state
        ^\QDatabase cluster state\E:\s+(.*?)\s*$
        .*
        # Get the latest known REDO location
        ^\QLatest checkpoint's REDO location\E:\s+([/0-9A-F]+)\s*$
        .*
        # Get the latest known TL
        ^\QLatest checkpoint's TimeLineID\E:\s+(\d+)\s*$
        .*
        # Get the wal level
        # NOTE: pg_controldata output changed with PostgreSQL 9.5, so we need to
        # account for both syntaxes
        ^(?:\QCurrent \E)?\Qwal_level setting\E:\s+(.*?)\s*$
    }smx ) {
        $controldata{'state'}     = $1;
        $controldata{'redo'}      = $2;
        $controldata{'tl'}        = $3;
        $controldata{'wal_level'} = $4;
    }

    ocf_log( 'debug',
        "_get_controldata: found: %s",
        Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump );

    return %controldata if defined $controldata{'state'}
                        and defined $controldata{'tl'}
                        and defined $controldata{'redo'}
                        and defined $controldata{'wal_level'};

    ocf_exit_reason( 'Could not read all datas from controldata file for "%s"',
        $datadir );

    # log both the parsed values and the raw output of pg_controldata
    ocf_log( 'debug',
        "_get_controldata: controldata file: %s, pg_controldata output: %s",
        Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump, $ans );

    exit $OCF_ERR_ARGS;
}

# Read major version from datadir/PG_VERSION and return it as numeric version
#
# The numeric version is "major * 10000", plus "second number * 100" before
# v10 (eg. 90300 for "9.3", 120000 for "12").
#
# Exits with OCF_ERR_ARGS if PG_VERSION is missing, empty, unreadable or if
# its content is not a version number.
sub _get_pg_version {
    my $version_file = "$datadir/PG_VERSION";
    my $fh;
    my $pgversion;
    my $vernum;
    my $major;
    my $minor;

    # check PG_VERSION
    if ( ! -s $version_file ) {
        ocf_exit_reason( 'PG_VERSION does not exist in "%s"', $datadir );
        exit $OCF_ERR_ARGS;
    }

    unless ( open( $fh, '<', $version_file ) ) {
        ocf_exit_reason( 'Could not open file "%s": %s', $version_file, $! );
        exit $OCF_ERR_ARGS;
    }

    unless ( defined read( $fh, $pgversion, 32 ) ) {
        ocf_exit_reason( 'Could not read file "%s": %s', $version_file, $! );
        exit $OCF_ERR_ARGS;
    }
    close $fh;

    chomp $pgversion;

    # Do not compute a version from captures of a failed match: they would
    # be undefined or hold the values of a previous successful match.
    unless ( $pgversion =~ /^(\d+)(?:\.(\d+))?$/ ) {
        ocf_exit_reason( 'Could not parse version "%s" read from "%s"',
            $pgversion, $version_file );
        exit $OCF_ERR_ARGS;
    }

    ( $major, $minor ) = ( $1, $2 );

    $vernum  = $major * 10000;
    # no 2nd num in the major version from v10
    $vernum += ( defined $minor ? $minor : 0 ) * 100 if $major < 10;

    return $vernum;
}

# Map the cluster state reported by pg_controldata to an OCF return code, so
# we can find whether the instance is a primary or a secondary, or detect any
# inconsistency that could indicate the instance has crashed.
#
# Returns:
#   OCF_RUNNING_MASTER for "in production"
#   OCF_SUCCESS        for "in archive recovery"
#   OCF_NOT_RUNNING    for "shut down" and "shut down in recovery"
# Any other state is polled again every second until it changes.
#
sub _controldata_to_ocf {
    my %ocf_code_of = (
        # Instance should be running as a primary.
        'in production'         => $OCF_RUNNING_MASTER,
        # Instance should be running as a secondary.
        # This state includes warm standby (rejects connections attempts,
        # including pg_isready)
        'in archive recovery'   => $OCF_SUCCESS,
        # The instance should be stopped.
        # We don't care if it was a primary or secondary before, because we
        # always start instances as secondaries, and then promote if necessary.
        'shut down'             => $OCF_NOT_RUNNING,
        'shut down in recovery' => $OCF_NOT_RUNNING,
    );

    while ( 1 ) {
        my %cdata = _get_controldata();
        my $state = $cdata{'state'};

        ocf_log( 'debug', '_controldata: instance "%s" state is "%s"',
            $OCF_RESOURCE_INSTANCE, $state );

        return $ocf_code_of{ $state } if exists $ocf_code_of{ $state };

        # The state is "in crash recovery", "starting up" or "shutting down".
        # This state should be transitional, so we wait and loop to check if
        # it changes.
        # If it does not, pacemaker will eventually abort with a timeout.
        ocf_log( 'debug',
            '_controldata: waiting for transitionnal state "%s" to finish',
            $state );
        sleep 1;
    }

    # Defensive code: the loop above only ends by returning, so reaching this
    # point means something went really wrong with this code or
    # pg_controldata.
    ocf_exit_reason( 'Unable get instance "%s" state using pg_controldata',
        $OCF_RESOURCE_INSTANCE );

    return $OCF_ERR_INSTALLED ;
}

# Check the write_location of all secondaries, and adapt their master score so
# that the instance closest to the master will be the selected candidate should
# a promotion be triggered.
# NOTE: This is only a hint to pacemaker! The selected candidate to promotion
# actually re-check it is the best candidate and force a re-election by failing
# if a better one exists. This avoid a race condition between the call of the
# monitor action and the promotion where another slave might have catchup faster
# with the master.
# NOTE: we cannot directly use the write_location, neither a lsn_diff value as
# promotion score as Pacemaker considers any value greater than 1,000,000 as
# INFINITY.
#
# Scores are set as follows:
#   * -1 for standbies in state "startup" or "backup"
#   * the priority computed by _get_lag_scores() for other connected standbies
#   * -1000 for nodes of the partition with no row in pg_stat_replication
#   * 1001 for the local node
#
# This sub is supposed to be executed from a master monitor action.
#
# Returns OCF_SUCCESS.
#
sub _check_locations {
    my $node_score;
    my @partition_nodes;
    my %pending;
    my $ans;
    my @rs;

    # Call crm_node to exclude nodes that are not part of the cluster at this
    # point.
    $ans = qx{ $CRM_NODE --partition };
    $ans = '' unless defined $ans;

    # Node names are compared using exact string matching: interpolating an
    # application_name in a regex would allow substring matches (eg. "pg1"
    # matching "pg10") and would interpret any regex metacharacter.
    @partition_nodes = split ' ', $ans;
    # nodes of the partition not found in pg_stat_replication yet
    %pending = map { $_ => 1 } @partition_nodes;

    @rs = @{ _get_lag_scores() };

    # If there is no row left at this point, it means that there is no
    # secondary instance connected.
    ocf_log( 'warning', 'No secondary connected to the master' )
        if scalar @rs == 0;

    # For each standby connected, set their master score based on the following
    # rule: the first known node/application, with the highest priority and
    # with an acceptable state.
    foreach my $row ( @rs ) {
        my ( $name, $priority, $location, $state, $lag ) = @{ $row };

        unless ( $pending{ $name } ) {
            ocf_log( 'info', 'Ignoring unknown application_name/node "%s"',
                $name );
            next;
        }

        if ( $name eq $nodename ) {
            ocf_log( 'warning', 'Streaming replication with myself!' );
            next;
        }

        $node_score = _get_master_score( $name );

        if ( $state =~ /^\s*(?:startup|backup)\s*$/ ) {
            # We exclude any standby being in state backup (pg_basebackup) or
            # startup (new standby or failing standby)
            ocf_log( 'info', 'Forbidding promotion on "%s" in state "%s"',
                $name, $state );

            _set_master_score( '-1', $name ) unless $node_score eq '-1';
        }
        else {
            ocf_log( 'debug',
                '_check_locations: checking "%s" promotion ability (current_score: %s, priority: %s, location: %s, lag: %s)',
                $name, $node_score, $priority, $location, $lag );

            if ( $node_score ne $priority ) {
                if ( $priority < -1 ) {
                    ocf_log( 'info', 'Update score of "%s" from %s to %s because replication lag (%s) is higher than given maxlag (%s).',
                        $name, $node_score, $priority, $lag, $maxlag );
                }
                else {
                    ocf_log( 'info', 'Update score of "%s" from %s to %s because of a change in the replication lag (%s).',
                        $name, $node_score, $priority, $lag );
                }
                _set_master_score( $priority, $name );
            }
            else {
                ocf_log( 'debug',
                    '_check_locations: "%s" keeps its current score of %s',
                    $name, $priority );
            }
        }

        # Remove this node from the known nodes list.
        delete $pending{ $name };
    }

    # If there are still nodes in "pending", it means there is no
    # corresponding line in "pg_stat_replication".
    # Nodes are processed in the order given by crm_node.
    foreach my $node ( grep { $pending{ $_ } } @partition_nodes ) {
        # Exclude the current node.
        next if $node eq $nodename;

        # do not warn if the master score is already set to -1000.
        # this avoid log flooding (gh #138)
        $node_score = _get_master_score( $node );
        next if $node_score eq '-1000';

        ocf_log( 'warning', '"%s" is not connected to the primary', $node );
        _set_master_score( '-1000', $node );
    }

    # Finally set the master score if not already done
    $node_score = _get_master_score();
    _set_master_score( '1001' ) unless $node_score eq '1001';

    return $OCF_SUCCESS;
}

# _check_switchover
# check if the pgsql switchover to the localnode is safe.
# This is supposed to be called **after** the master has been stopped or demoted.
# This sub checks if the local standby received the shutdown checkpoint from the
# old master to make sure it can take over the master role and the old master
# will be able to catchup as a standby after.
#
# Returns 0 if switchover is safe
# Returns 1 if switchover is not safe
# Returns 2 for internal error
sub _check_switchover {
    my $last_redo;
    my $last_lsn;
    my @unused_rs;
    my $ans;
    my $rc;
    my $tl;
    my %cdata;

    # the tool was named pg_xlogdump before v10
    $PGWALDUMP = "$bindir/pg_xlogdump" if $PGVERNUM < $PGVER_10;

    ocf_log( 'info', 'Switchover in progress from "%s" to "%s".'
        .' Need to check the last record in WAL',
        $OCF_NOTIFY_ENV{'demote'}[0]{'uname'}, $nodename );

    # check if we received the shutdown checkpoint of the master during its
    # demote process.
    # We need the last local checkpoint LSN and the last received LSN from
    # master to check in the WAL between these addresses if we have a
    # "checkpoint shutdown" using pg_xlogdump/pg_waldump.
    #
    # Force a checkpoint to make sure the controldata shows the very last TL
    # and the master's shutdown checkpoint
    # NOTE: _query pushes rows in an array, so it must be given an array
    # reference, even if the result set is ignored here.
    $rc = _query( q{ CHECKPOINT }, \@unused_rs );

    ocf_log( 'warning',
        'Forced checkpoint failed (rc: %s), controldata might be outdated',
        $rc ) if $rc != 0;

    %cdata     = _get_controldata();
    $tl        = $cdata{'tl'};
    $last_redo = $cdata{'redo'};

    # Get the last received LSN from master
    $last_lsn = _get_last_received_lsn();

    unless ( defined $last_lsn ) {
        ocf_exit_reason( 'Could not fetch last received LSN!' );

        return 2;
    }

    # dump the WAL records between the last redo point and the last received
    # LSN
    $ans = qx{ $PGWALDUMP --path "$datadir" --timeline "$tl" \\
               --start "$last_redo" --end "$last_lsn" 2>&1 };
    $rc = $?;
    $ans = '' unless defined $ans;

    ocf_log( 'debug',
        '_check_switchover: %s rc: "%s", tl: "%s", last_chk: %s, last_lsn: %s, output: "%s"',
        $PGWALDUMP, $rc, $tl, $last_redo, $last_lsn, $ans
    );

    # look for a shutdown checkpoint record on the current timeline
    if ( $rc == 0 and
         $ans =~ m{^rmgr: XLOG.*desc: (?i:checkpoint)(?::|_SHUTDOWN) redo [0-9A-F/]+; tli $tl;.*; shutdown$}m
    ) {
        ocf_log( 'info', 'Slave received the shutdown checkpoint' );
        return 0;
    }

    ocf_exit_reason(
        'Did not receive the shutdown checkpoint from the old master!' );

    return 1;
}

# Query the instance with "SELECT pg_is_in_recovery()" to confirm it is really
# started as _pg_isready stated and to find if it is a primary or a secondary.
#
# Returns:
#   OCF_SUCCESS        if the instance is a secondary
#   OCF_RUNNING_MASTER if the instance is a primary. During a monitor action,
#                      _check_locations() is called first.
#   OCF_ERR_GENERIC    if psql could not connect (rc 1 or 2)
#   OCF_ERR_CONFIGURED for any other query failure or unexpected result
#
sub _confirm_role {
    my @rows;
    my $rc          = _query( "SELECT pg_is_in_recovery()", \@rows );
    my $in_recovery = $rows[0][0];

    if ( $rc == 1 or $rc == 2 ) {
        # psql cound not connect to the instance.
        # As pg_isready reported the instance was listening, this error
        # could be a max_connection saturation. Just report a soft error.
        ocf_exit_reason( 'psql could not connect to instance "%s"',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_ERR_GENERIC;
    }

    if ( $rc != 0 ) {
        # The query failed (rc: 3) or bad parameters (rc: -1).
        # This should not happen, raise a hard configuration error.
        ocf_exit_reason(
            'The query to check if instance "%s" is a primary or a secondary failed (rc: %d)',
            $OCF_RESOURCE_INSTANCE, $rc );

        return $OCF_ERR_CONFIGURED;
    }

    # The query was executed, check the result.
    if ( $in_recovery eq 't' ) {
        # The instance is a secondary.
        ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a secondary");
        return $OCF_SUCCESS;
    }

    if ( $in_recovery eq 'f' ) {
        # The instance is a primary.
        ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a primary");
        # Check lsn diff with current slaves if any
        _check_locations() if $__OCF_ACTION eq 'monitor';
        return $OCF_RUNNING_MASTER;
    }

    # This should not happen, raise a hard configuration error.
    ocf_exit_reason(
        'Unexpected result from query to check if "%s" is a primary or a secondary: "%s"',
        $OCF_RESOURCE_INSTANCE, $in_recovery );

    return $OCF_ERR_CONFIGURED;
}


# Check to confirm if the instance is really stopped as _pg_isready stated
# and if it was properly shut down.
#
# Returns:
#   OCF_NOT_RUNNING   if there is no postmaster process and either a
#                     backup_label file exists or the controldata reports a
#                     shut down instance
#   OCF_FAILED_MASTER if the controldata still reports a running primary
#   OCF_ERR_GENERIC   if the postmaster process exists, if the controldata
#                     still reports a running secondary, or if the
#                     controldata check returned an unexpected code
#
sub _confirm_stopped {
    # Check the postmaster process status.
    if ( _pg_ctl_status() == 0 ) {
        # The PID file exists and the process is available.
        # That should not be the case, return an error.
        ocf_exit_reason(
            'Instance "%s" is not listening, but the process referenced in postmaster.pid exists',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_ERR_GENERIC;
    }

    # The PID file does not exist or the process is not available.
    ocf_log( 'debug',
        '_confirm_stopped: no postmaster process found for instance "%s"',
        $OCF_RESOURCE_INSTANCE );

    if ( -f "$datadir/backup_label" ) {
        # We are probably on a freshly built secondary that was not started yet.
        ocf_log( 'debug',
            '_confirm_stopped: backup_label file exists: probably on a never started secondary' );
        return $OCF_NOT_RUNNING;
    }

    # Continue the check with pg_controldata.
    my $state_rc = _controldata_to_ocf();

    if ( $state_rc == $OCF_RUNNING_MASTER ) {
        # The controldata has not been updated to "shutdown".
        # It should mean we had a crash on a primary instance.
        ocf_exit_reason(
            'Instance "%s" controldata indicates a running primary instance, the instance has probably crashed',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_FAILED_MASTER;
    }

    if ( $state_rc == $OCF_SUCCESS ) {
        # The controldata has not been updated to "shutdown in recovery".
        # It should mean we had a crash on a secondary instance.
        # There is no "FAILED_SLAVE" return code, so we return a generic error.
        ocf_exit_reason(
            'Instance "%s" controldata indicates a running secondary instance, the instance has probably crashed',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_ERR_GENERIC;
    }

    if ( $state_rc == $OCF_NOT_RUNNING ) {
        # The controldata state is consistent, the instance was probably
        # properly shut down.
        ocf_log( 'debug',
            '_confirm_stopped: instance "%s" controldata indicates that the instance was propertly shut down',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_NOT_RUNNING;
    }

    # Something went wrong with the controldata check.
    ocf_exit_reason(
        'Could not get instance "%s" status from controldata (returned: %d)',
        $OCF_RESOURCE_INSTANCE, $state_rc );

    return $OCF_ERR_GENERIC;
}

############################################################
#### OCF FUNCS



=head1 SUPPORTED PARAMETERS

=over

=item B<pgdata>

Location of the PGDATA of your instance

(optional, string, default "/var/lib/pgsql/data")

=item B<pghost>

The socket directory or IP address to use to connect to the local instance

(optional, string, default "/tmp")

=item B<pgport>

The port to connect to the local instance

(optional, integer, default "5432")

=item B<bindir>

Location of the PostgreSQL binaries.

(optional, string, default "/usr/bin")

=item B<system_user>

The system owner of your instance's process

(optional, string, default "postgres")

=item B<recovery_template>

B<ONLY> for PostgreSQL 11 and below.

The local template that will be copied as the C<PGDATA/recovery.conf> file.
This template file must exist on all nodes.

With PostgreSQL 12 and higher, the cluster will refuse to start if this
parameter is set or a template file is found.

(optional, string, default "$PGDATA/recovery.conf.pcmk")

=item B<maxlag>

Maximum lag allowed on a standby before we set a negative master score on it.
The calculation is based on the difference between the current xlog location on
the master and the write location on the standby.

(optional, integer, default "0" disables this feature)

=item B<datadir>

Path to the directory set in C<data_directory> from your postgresql.conf file.
This parameter has the same default as PostgreSQL itself: the C<pgdata> parameter
value.

Unless you have a special PostgreSQL setup and you understand this parameter,
B<ignore it>

(optional, string, default to the value of C<pgdata>)

=item B<start_opts>

Additional arguments given to the postgres process on startup. See
"postgres --help" for available options. Useful when the postgresql.conf file
is not in the data directory (PGDATA), eg.:

  -c config_file=/etc/postgresql/9.3/main/postgresql.conf

(optional, string, default "")

=back

=cut

# Print the resource agent meta-data as an XML document on the currently
# selected output handle. The default value advertised for most parameters
# is interpolated from the matching "*_default" variable.
#
# Always returns $OCF_SUCCESS.
sub ocf_meta_data {
    # Keep the document in one interpolated literal: its content, including
    # the indentation, is emitted as is.
    my $metadata = qq{<?xml version="1.0"?>
        <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
        <resource-agent name="pgsqlms">
          <version>1.0</version>

          <longdesc lang="en">
            Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource.
          </longdesc>
          <shortdesc lang="en">Manages PostgreSQL servers in replication</shortdesc>
          <parameters>
            <parameter name="system_user" unique="0" required="0">
              <longdesc lang="en">
                System user account used to run the PostgreSQL server
              </longdesc>
              <shortdesc lang="en">PostgreSQL system User</shortdesc>
              <content type="string" default="$system_user_default" />
            </parameter>

            <parameter name="bindir" unique="0" required="0">
              <longdesc lang="en">
                Path to the directory storing the PostgreSQL binaries. The agent uses psql, pg_isready, pg_controldata and pg_ctl.
              </longdesc>
              <shortdesc lang="en">Path to the PostgreSQL binaries</shortdesc>
              <content type="string" default="$bindir_default" />
            </parameter>

            <parameter name="pgdata" unique="1" required="0">
              <longdesc lang="en">
                Path to the data directory, e.g. PGDATA
              </longdesc>
              <shortdesc lang="en">Path to the data directory</shortdesc>
              <content type="string" default="$pgdata_default" />
            </parameter>

            <parameter name="datadir" unique="1" required="0">
              <longdesc lang="en">
                Path to the directory set in data_directory from your postgresql.conf file. This parameter
                has the same default than PostgreSQL itself: the pgdata parameter value. Unless you have a
                special PostgreSQL setup and you understand this parameter, ignore it.
              </longdesc>
              <shortdesc lang="en">Path to the directory set in data_directory from your postgresql.conf file</shortdesc>
              <content type="string" default="PGDATA" />
            </parameter>

            <parameter name="pghost" unique="0" required="0">
              <longdesc lang="en">
                Host IP address or unix socket folder the instance is listening on.
              </longdesc>
              <shortdesc lang="en">Instance IP or unix socket folder</shortdesc>
              <content type="string" default="$pghost_default" />
            </parameter>

            <parameter name="pgport" unique="0" required="0">
              <longdesc lang="en">
                Port the instance is listening on.
              </longdesc>
              <shortdesc lang="en">Instance port</shortdesc>
              <content type="integer" default="$pgport_default" />
            </parameter>

           <parameter name="maxlag" unique="0" required="0">
              <longdesc lang="en">
                Maximum lag allowed on a standby before we set a negative master score on it. The calculation
                is based on the difference between the current LSN on the master and the LSN
                written on the standby.
                This parameter must be a valid positive number as described in PostgreSQL documentation.
                See: https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
              </longdesc>
              <shortdesc lang="en">Maximum write lag before we mark a standby as inappropriate to promote</shortdesc>
              <content type="integer" default="$maxlag_default" />
            </parameter>

            <parameter name="recovery_template" unique="1" required="0">
              <longdesc lang="en">
                Path to the recovery.conf template. This file is simply copied to \$PGDATA
                before starting the instance as slave.
                ONLY for PostgreSQL 11 and bellow. This parameter is IGNORED for
                PostgreSQL 12 and higher. The cluster will refuse to start if a template
                file is found.
              </longdesc>
              <shortdesc lang="en">Path to the recovery.conf template for PostgreSQL 11 and older.</shortdesc>
              <content type="string" default="PGDATA/recovery.conf.pcmk" />
            </parameter>

            <parameter name="start_opts" unique="0" required="0">
              <longdesc lang="en">
                Additionnal arguments given to the postgres process on startup.
                See "postgres --help" for available options. Usefull when the
                postgresql.conf file is not in the data directory (PGDATA), eg.:
                "-c config_file=/etc/postgresql/9.3/main/postgresql.conf".
              </longdesc>
              <shortdesc lang="en">Additionnal arguments given to the postgres process on startup.</shortdesc>
              <content type="string" default="$start_opts_default" />
            </parameter>

          </parameters>
          <actions>
            <action name="start" timeout="60" />
            <action name="stop" timeout="60" />
            <action name="reload" timeout="20" />
            <action name="promote" timeout="30" />
            <action name="demote" timeout="120" />
            <action name="monitor" depth="0" timeout="10" interval="15"/>
            <action name="monitor" depth="0" timeout="10" interval="15" role="Master"/>
            <action name="monitor" depth="0" timeout="10" interval="16" role="Slave"/>
            <action name="notify" timeout="60" />
            <action name="meta-data" timeout="5" />
            <action name="validate-all" timeout="5" />
            <action name="methods" timeout="5" />
          </actions>
        </resource-agent>
    };

    print $metadata;

    return $OCF_SUCCESS;
}


=head1 SUPPORTED ACTIONS

This resource agent supports the following actions (operations):

=over

=item B<start>

Starts the resource. Suggested minimum timeout: 60.

=item B<stop>

Stops the resource. Suggested minimum timeout: 60.

=item B<reload>

Suggested minimum timeout: 20.

=item B<promote>

Promotes the resource to the Master role. Suggested minimum timeout: 30.

=item B<demote>

Demotes the resource to the Slave role. Suggested minimum timeout: 120.

=item B<monitor (Master role)>

Performs a detailed status check. Suggested minimum timeout: 10.
Suggested interval: 15.

=item B<monitor (Slave role)>

Performs a detailed status check. Suggested minimum timeout: 10.
Suggested interval: 16.

=item B<notify>

Suggested minimum timeout: 60

=item B<meta-data>

Retrieves resource agent metadata (internal use only).
Suggested minimum timeout: 5.

=item B<methods>

Suggested minimum timeout: 5.

=item B<validate-all>

Performs a validation of the resource configuration.
Suggested minimum timeout: 5.

=back

=cut

# Print the list of actions supported by this resource agent, one per line,
# on the currently selected output handle.
#
# Always returns $OCF_SUCCESS.
sub ocf_methods {
    my @supported_actions = qw(
        start stop reload promote demote
        monitor notify methods meta-data validate-all
    );

    # The listing starts with an empty line, each action is indented with
    # eight spaces and the output ends with four spaces and no newline.
    my $listing = "\n";
    $listing .= "        $_\n" for @supported_actions;
    $listing .= '    ';

    print $listing;

    return $OCF_SUCCESS;
}

############################################################
#### RA FUNCS

# Validate the environment, the resource setup and the PostgreSQL
# configuration before any other action is attempted.
#
# Every failed check registers an exit reason with ocf_exit_reason() and
# returns immediately.
#
# Returns:
#   * $OCF_SUCCESS       when every check passed
#   * $OCF_ERR_INSTALLED when the Pacemaker feature set is too old, a meta
#                        parameter (notify, master-max) is wrong, maxlag is
#                        not a number or PostgreSQL is older than 9.3
#   * $OCF_ERR_ARGS      when the recovery setup, the system user, the
#                        binaries or wal_level are wrong
#
sub pgsql_validate_all {
    my $ans = '';
    my %cdata;

    unless (
        ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.0.9' ) == 2
    ) {
        ocf_exit_reason(
            'PAF %s is compatible with Pacemaker 1.1.13 and greater',
            $VERSION
        );
        return $OCF_ERR_INSTALLED;
    }

    # check notify=true
    $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\
                 --meta --get-parameter notify 2>/dev/null };
    chomp $ans;
    unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) {
        ocf_exit_reason(
            'You must set meta parameter notify=true for your master resource'
        );
        return $OCF_ERR_INSTALLED;
    }

    # check master-max=1
    unless (
        defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
            and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1'
    ) {
        ocf_exit_reason(
            'You must set meta parameter master-max=1 for your master resource'
        );
        return $OCF_ERR_INSTALLED;
    }

    if ( $PGVERNUM >= $PGVER_12 ) {
        # check PostgreSQL setup: checks related to v12 and after
        my $guc;

        # recovery.conf template must not exist
        if ( -f $recovery_tpl ) {
            ocf_exit_reason(
                'Recovery template file "%s" is forbidden for PostgreSQL 12 and above',
                $recovery_tpl );
            # Return like every other failed check of this sub instead of
            # exiting from within it.
            return $OCF_ERR_ARGS;
        }

        # WARNING: you MUST put -C as first argument to bypass the root check
        $guc = qx{ $POSTGRES -C recovery_target_timeline -D "$pgdata" $start_opts};
        chomp $guc;
        unless ( $guc eq 'latest' ) {
            ocf_exit_reason(
                q{Parameter "recovery_target_timeline" MUST be set to 'latest'. } .
                q{It is currently set to '%s'}, $guc );
            return $OCF_ERR_ARGS;
        }

        $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts};
        # Drop the trailing newline so it does not end up in the exit reason.
        chomp $guc;
        # The node name is matched literally (\Q...\E): it may contain regex
        # metacharacters such as ".". It must also be the whole value, so it
        # has to be followed by a space, a quote or the end of the string.
        unless ( $guc =~ /\bapplication_name=\Q$nodename\E(?=['\s]|\z)/ ) {
            ocf_exit_reason(
                q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }.
                q{It is currently set to '%s'}, $nodename, $guc );
            return $OCF_ERR_ARGS;
        }
    }
    else {
        my $fh;
        my @content;

        # check recovery template
        if ( ! -f $recovery_tpl ) {
            ocf_exit_reason( 'Recovery template file "%s" does not exist',
                $recovery_tpl );
            return $OCF_ERR_ARGS;
        }

        # check content of the recovery template file
        unless ( open( $fh, '<', $recovery_tpl ) ) {
            ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! );
            return $OCF_ERR_ARGS;
        }
        @content = <$fh>;
        close $fh;

        unless ( grep /^\s*standby_mode\s*=\s*'?on'?\s*$/, @content ) {
            ocf_exit_reason(
                'Recovery template file must contain "standby_mode = on"' );
            return $OCF_ERR_ARGS;
        }

        unless ( grep /^\s*recovery_target_timeline\s*=\s*'?latest'?\s*$/, @content ) {
            ocf_exit_reason(
                "Recovery template file must contain \"recovery_target_timeline = 'latest'\""
            );
            return $OCF_ERR_ARGS;
        }

        # The node name is matched literally (\Q...\E) as it may contain
        # regex metacharacters such as ".".
        unless (
            grep /^\s*primary_conninfo\s*=.*['\s]application_name=\Q$nodename\E['\s]/,
            @content
        ) {
            ocf_exit_reason(
                'Recovery template file must contain in primary_conninfo parameter "application_name=%s"',
                $nodename );
            return $OCF_ERR_ARGS;
        }
    }

    unless ( looks_like_number($maxlag) ) {
        ocf_exit_reason( 'maxlag is not a number: "%s"', $maxlag );
        return $OCF_ERR_INSTALLED;
    }

    # check system user
    unless ( defined getpwnam $system_user ) {
        ocf_exit_reason( 'System user "%s" does not exist', $system_user );
        return $OCF_ERR_ARGS;
    }

    # require 9.3 minimum
    if ( $PGVERNUM < $PGVER_93 ) {
        ocf_exit_reason( "Require 9.3 and more" );
        return $OCF_ERR_INSTALLED;
    }

    # check binaries
    unless ( -x $PGCTL and -x $PGPSQL and -x $PGCTRLDATA and -x $PGISREADY
         and ( -x $PGWALDUMP or -x "$bindir/pg_xlogdump")
     ) {
        ocf_exit_reason(
            "Missing one or more binary. Check following path: %s, %s, %s, %s, %s or %s",
            $PGCTL, $PGPSQL, $PGCTRLDATA, $PGISREADY, $PGWALDUMP, "$bindir/pg_xlogdump" );
        return $OCF_ERR_ARGS;
    }

    # require wal_level >= hot_standby
    # A missing wal_level is reported like an unsupported one, without
    # triggering an "uninitialized value" warning on the way.
    %cdata = _get_controldata();
    unless ( defined $cdata{'wal_level'}
        and $cdata{'wal_level'} =~ m{hot_standby|logical|replica}
    ) {
        ocf_exit_reason(
            'wal_level must be one of "hot_standby", "logical" or "replica"' );
        return $OCF_ERR_ARGS;
    }

    return $OCF_SUCCESS;
}


# Start the PostgreSQL instance as a *secondary*
#
# Returns $OCF_SUCCESS when the instance is running as a secondary, whether
# it was already started or has just been started, and $OCF_ERR_GENERIC in
# every other case.
#
sub pgsql_start {
    my $rc    = pgsql_monitor();
    my %cdata = _get_controldata();

    # State recorded in the controldata before any start attempt.
    my $state_before_start = $cdata{'state'};

    # Only two states are acceptable here: already running as a secondary,
    # or stopped.
    if ( $rc == $OCF_SUCCESS ) {
        ocf_log( 'info', 'Instance "%s" already started',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_SUCCESS;
    }

    if ( $rc != $OCF_NOT_RUNNING ) {
        ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)',
            $OCF_RESOURCE_INSTANCE, $rc );
        return $OCF_ERR_GENERIC;
    }

    #
    # From here, the instance is NOT running for sure.
    #

    ocf_log( 'debug',
        'pgsql_start: instance "%s" is not running, starting it as a secondary',
        $OCF_RESOURCE_INSTANCE );

    # The instance must come up as a standby: enable recovery first.
    _enable_recovery();

    $rc = _pg_ctl_start();

    if ( $rc != 0 ) {
        ocf_exit_reason( 'Instance "%s" failed to start (rc: %d)',
            $OCF_RESOURCE_INSTANCE, $rc );
        return $OCF_ERR_GENERIC;
    }

    # Poll the monitor every second until it reports anything but
    # "not running".
    while (1) {
        $rc = pgsql_monitor();
        last if $rc != $OCF_NOT_RUNNING;
        sleep 1;
    }

    if ( $rc != $OCF_SUCCESS ) {
        ocf_exit_reason(
            'Instance "%s" is not running as a slave (returned %d)',
             $OCF_RESOURCE_INSTANCE, $rc );
        return $OCF_ERR_GENERIC;
    }

    ocf_log( 'info', 'Instance "%s" started', $OCF_RESOURCE_INSTANCE );

    # During the very first start of the cluster, no master score exists on
    # any node unless an admin designated one using crm_master, and without
    # any master score the cluster will not promote a master among the
    # slaves. To get out of this situation, set a score of 1 on this node
    # when the instance was a cleanly shut down master before the start and
    # no master score is found among the clones of the resource. Do nothing
    # if a master score already exists somewhere.
    if ( $state_before_start eq 'shut down' ) {
        unless ( _master_score_exists() ) {
            ocf_log( 'info', 'No master score around. Set mine to 1' );

            _set_master_score( '1' );
        }
    }

    return $OCF_SUCCESS;
}

# Stop the PostgreSQL instance
#
# The instance is stopped with "pg_ctl stop" in fast mode.
#
# Returns $OCF_SUCCESS when the instance is stopped, whether it was already
# stopped or has just been stopped, and $OCF_ERR_GENERIC when the instance is
# in an unexpected state or when pg_ctl fails.
#
sub pgsql_stop {
    my $rc;
    # Add 60s to the timeout or use a 24h timeout fallback to make sure
    # Pacemaker will give up before us and take decisions
    my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60;

    # Instance must be running as secondary or primary or being stopped.
    # Anything else is an error.
    $rc = pgsql_monitor();
    if ( $rc == $OCF_NOT_RUNNING ) {
        ocf_log( 'info', 'Instance "%s" already stopped',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_SUCCESS;
    }
    elsif ( $rc != $OCF_SUCCESS and $rc != $OCF_RUNNING_MASTER ) {
        ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)',
            $OCF_RESOURCE_INSTANCE, $rc );
        return $OCF_ERR_GENERIC;
    }

    #
    # From here, the instance is running for sure.
    #

    ocf_log( 'debug', 'pgsql_stop: instance "%s" is running, stopping it',
        $OCF_RESOURCE_INSTANCE );

    # Try to quit with proper shutdown.
    $rc = _runas( $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $timeout,
        '-m', 'fast', 'stop' );

    if ( $rc == 0 ) {
        # Wait for the stop to finish: poll the monitor every second until it
        # reports the instance as not running.
        sleep 1 while ( $rc = pgsql_monitor() ) != $OCF_NOT_RUNNING ;

        ocf_log( 'info', 'Instance "%s" stopped', $OCF_RESOURCE_INSTANCE );

        return $OCF_SUCCESS;
    }

    ocf_exit_reason( 'Instance "%s" failed to stop', $OCF_RESOURCE_INSTANCE );
    return $OCF_ERR_GENERIC;
}

# Monitor the PostgreSQL instance
#
# The diagnostic starts from the pg_isready return code and is cross-checked
# with pg_controldata when the instance rejects connections.
#
# Returns:
#   * the result of _confirm_role() when the instance is listening
#   * the result of _confirm_stopped() when the instance is not listening
#   * $OCF_ERR_GENERIC    when pg_isready and the controldata disagree, or
#                         when pg_isready returns an unexpected code
#   * $OCF_ERR_INSTALLED  when the controldata check itself fails
#   * $OCF_ERR_CONFIGURED when pg_isready did not attempt any connection
#
sub pgsql_monitor {
    my $pgisready_rc;
    my $controldata_rc;

    ocf_log( 'debug', 'pgsql_monitor: monitor is a probe' ) if ocf_is_probe();

    # First check, verify if the instance is listening.
    $pgisready_rc = _pg_isready();

    if ( $pgisready_rc == 0 ) {
        # The instance is listening.
        # We confirm that the instance is up and return if it is a primary or a
        # secondary
        ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening',
            $OCF_RESOURCE_INSTANCE );
        return _confirm_role();
    }

    if ( $pgisready_rc == 1 ) {
        # The attempt was rejected.
        # This could happen in several cases:
        #   - at startup
        #   - during shutdown
        #   - during crash recovery
        #   - if instance is a warm standby
        # Except for the warm standby case, this should be a transitional state.
        # We try to confirm using pg_controldata.
        ocf_log( 'debug',
            'pgsql_monitor: instance "%s" rejects connections - checking again...',
            $OCF_RESOURCE_INSTANCE );
        $controldata_rc = _controldata_to_ocf();

        if ( $controldata_rc == $OCF_RUNNING_MASTER
            or $controldata_rc == $OCF_SUCCESS
        ) {
            # This state indicates that pg_isready check should succeed.
            # We check again.
            ocf_log( 'debug',
                'pgsql_monitor: instance "%s" controldata shows a running status',
                $OCF_RESOURCE_INSTANCE );

            $pgisready_rc = _pg_isready();
            if ( $pgisready_rc == 0 ) {
                # Consistent with pg_controldata output.
                # We can check if the instance is primary or secondary
                ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening',
                    $OCF_RESOURCE_INSTANCE );
                return _confirm_role();
            }

            # Still not consistent, raise an error.
            # NOTE: if the instance is a warm standby, we end here.
            # TODO raise a hard error here?
            ocf_exit_reason(
                'Instance "%s" controldata is not consistent with pg_isready (returned: %d)',
                $OCF_RESOURCE_INSTANCE, $pgisready_rc );
            # This message has no placeholder: it must not receive any
            # formatting argument.
            ocf_log( 'info',
                'If this instance is in warm standby, this resource agent only supports hot standby' );

            return $OCF_ERR_GENERIC;
        }

        if ( $controldata_rc == $OCF_NOT_RUNNING ) {
            # This state indicates that pg_isready check should fail with rc 2.
            # We check again.
            $pgisready_rc = _pg_isready();
            if ( $pgisready_rc == 2 ) {
                # Consistent with pg_controldata output.
                # We check the process status using pg_ctl status and check
                # if it was properly shut down using pg_controldata.
                ocf_log( 'debug',
                    'pgsql_monitor: instance "%s" is not listening',
                    $OCF_RESOURCE_INSTANCE );
                return _confirm_stopped();
            }
            # Still not consistent, raise an error.
            # TODO raise a hard error here?
            ocf_exit_reason(
                'Instance "%s" controldata is not consistent with pg_isready (returned: %d)',
                $OCF_RESOURCE_INSTANCE, $pgisready_rc );

            return $OCF_ERR_GENERIC;
        }

        # Something went wrong with the controldata check, hard fail.
        ocf_exit_reason(
            'Could not get instance "%s" status from controldata (returned: %d)',
            $OCF_RESOURCE_INSTANCE, $controldata_rc );

        return $OCF_ERR_INSTALLED;
    }

    if ( $pgisready_rc == 2 ) {
        # The instance is not listening.
        # We check the process status using pg_ctl status and check
        # if it was properly shut down using pg_controldata.
        ocf_log( 'debug', 'pgsql_monitor: instance "%s" is not listening',
            $OCF_RESOURCE_INSTANCE );
        return _confirm_stopped();
    }

    if ( $pgisready_rc == 3 ) {
        # No attempt was done, probably a syntax error.
        # Hard configuration error, we don't want to retry or failover here.
        ocf_exit_reason(
            'Unknown error while checking if instance "%s" is listening (returned %d)',
            $OCF_RESOURCE_INSTANCE, $pgisready_rc );

        return $OCF_ERR_CONFIGURED;
    }

    # Any other return code from pg_isready is unexpected.
    ocf_exit_reason( 'Unexpected result when checking instance "%s" status',
        $OCF_RESOURCE_INSTANCE );

    return $OCF_ERR_GENERIC;
}


# Demote the PostgreSQL instance from primary to secondary
# A demote is actually a restart:
#   * stop the instance gracefully
#   * start it again as a standby, pgsql_start taking care of the recovery
#     setup
#
# Returns $OCF_SUCCESS when the instance ends up running as a secondary,
# $OCF_ERR_CONFIGURED when the monitor reports a configuration error and
# $OCF_ERR_GENERIC in every other case.
#
sub pgsql_demote {
    my $current_state = pgsql_monitor();

    # Already running as secondary. Nothing to do.
    if ( $current_state == $OCF_SUCCESS ) {
        ocf_log( 'debug',
            'pgsql_demote: "%s" currently running as a secondary',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_SUCCESS;
    }

    # We actually prefer raising a hard or fatal error instead of leaving
    # the CRM aborting its transition for a new one because of a soft error.
    # The hard error will force the CRM to move the resource immediately.
    return $OCF_ERR_CONFIGURED if $current_state == $OCF_ERR_CONFIGURED;

    # Only a running primary or a stopped instance can be handled below.
    return $OCF_ERR_GENERIC
        if  $current_state != $OCF_RUNNING_MASTER
        and $current_state != $OCF_NOT_RUNNING;

    # TODO we need to make sure at least one slave is connected!!

    if ( $current_state == $OCF_NOT_RUNNING ) {
        # WARNING if the resource state is stopped instead of master, the ocf
        # ra dev rsc advises to return OCF_ERR_GENERIC, misleading the CRM in
        # a loop where it computes transitions of
        # demote(failing)->stop->start->promote actions until
        # failcount == migration-threshold.
        # This is a really ugly trick to keep going with the demote action if
        # the rsc is already stopped gracefully.
        # See discussion "CRM trying to demote a stopped resource" on
        # developers@clusterlabs.org
        ocf_log( 'debug', 'pgsql_demote: "%s" currently shut down',
            $OCF_RESOURCE_INSTANCE );
    }
    else {
        # Running as primary. Normal, expected behavior.
        ocf_log( 'debug', 'pgsql_demote: "%s" currently running as a primary',
            $OCF_RESOURCE_INSTANCE )  ;

        # Add 60s to the timeout or use a 24h timeout fallback to make sure
        # Pacemaker will give up before us and take decisions
        my $timeout = ( _get_action_timeout() || 60*60*24 )  + 60;

        # WARNING the instance **MUST** be stopped gracefully.
        # Do **not** use pg_stop() or service or systemctl here as these
        # commands might force-stop the PostgreSQL instance using immediate
        # after some timeout and return success, which is misleading.
        # The -w option makes pg_ctl wait for the stop to complete.
        my $stop_rc = _runas( $PGCTL, '--pgdata', $pgdata, '--mode', 'fast',
            '-w', '--timeout', $timeout , 'stop' );

        if ( $stop_rc != 0 ) {
            ocf_exit_reason( 'Failed to stop "%s" using pg_ctl (returned %d)',
                $OCF_RESOURCE_INSTANCE, $stop_rc );
            return $OCF_ERR_GENERIC;
        }

        # Double check that the instance is stopped correctly.
        my $state_after_stop = pgsql_monitor();

        if ( $state_after_stop != $OCF_NOT_RUNNING ) {
            ocf_exit_reason(
                'Unexpected "%s" state: monitor status (%d) disagree with pg_ctl return code',
                $OCF_RESOURCE_INSTANCE, $state_after_stop );
            return $OCF_ERR_GENERIC;
        }
    }

    #
    # At this point, the instance **MUST** be stopped gracefully.
    #

    # Note: neither the recovery.conf file nor the wait for the start to
    # complete need to be handled here, pgsql_start deals with both. It also
    # checks the instance state with pgsql_monitor before returning, so no
    # double check is needed either.
    my $start_rc = pgsql_start();

    if ( $start_rc != $OCF_SUCCESS ) {
        ocf_exit_reason( 'Starting "%s" as a standby failed (returned %d)',
            $OCF_RESOURCE_INSTANCE, $start_rc );
        return $OCF_ERR_GENERIC;
    }

    ocf_log( 'info', 'pgsql_demote: "%s" started as a secondary',
        $OCF_RESOURCE_INSTANCE );

    return $OCF_SUCCESS;
}


# Promote the secondary instance to primary
#
sub pgsql_promote {
    my $rc;
    my $cancel_switchover;

    $rc = pgsql_monitor();

    if ( $rc == $OCF_SUCCESS ) {
        # Running as slave. Normal, expected behavior.
        ocf_log( 'debug', 'pgsql_promote: "%s" currently running as a standby',
            $OCF_RESOURCE_INSTANCE );
    }
    elsif ( $rc == $OCF_RUNNING_MASTER ) {
        # Already a master. Unexpected, but not a problem.
        ocf_log( 'info', '"%s" already running as a primary',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_SUCCESS;
    }
    elsif ( $rc == $OCF_NOT_RUNNING ) { # INFO this is not supposed to happen.
        # Currently not running. Need to start before promoting.
        ocf_log( 'info', '"%s" currently not running, starting it',
            $OCF_RESOURCE_INSTANCE );

        $rc = pgsql_start();
        if ( $rc != $OCF_SUCCESS ) {
            ocf_exit_reason( 'Failed to start the instance "%s"',
                $OCF_RESOURCE_INSTANCE );
            return $OCF_ERR_GENERIC;
        }
    }
    else {
        ocf_exit_reason( 'Unexpected error, cannot promote "%s"',
            $OCF_RESOURCE_INSTANCE );
        return $OCF_ERR_GENERIC;
    }

    #
    # At this point, the instance **MUST** be started as a secondary.
    #

    # Cancel the switchover if it has been considered not safe during the
    # pre-promote action
    $cancel_switchover = _get_priv_attr('cancel_switchover');
    if ( $cancel_switchover ) { # if not empty or not 0
        ocf_exit_reason( 'Switchover has been canceled from pre-promote action' );

        _delete_priv_attr( 'cancel_switchover' );

        return $OCF_ERR_GENERIC if $cancel_switchover eq '1';
        return $OCF_ERR_ARGS; # ban the resource from the node if we have an
                              # internal error during _check_switchover
    }

    # Do not check for a better candidate if we try to recover the master
    # Recover of a master is detected during the pre-promote action. It sets the
    # private attribute 'recover_master' to '1' if this is a master recover.
    if ( _get_priv_attr( 'recover_master' ) eq '1' ) {
        ocf_log( 'info', 'Recovering old master, no election needed');
    }
    else {

        # The promotion is occurring on the best known candidate (highest
        # master score), as chosen by pacemaker during the last working monitor
        # on previous master (see pgsql_monitor/_check_locations subs).
        # To avoid any race condition between the last monitor action on the
        # previous master and the **real** most up-to-date standby, we
        # set each standby location during the "pre-promote" action, and stored
        # them using the "lsn_location" resource attribute.
        #
        # The best standby to promote would have the highest known LSN. If the
        # current resource is not the best one, we need to modify the master
        # scores accordingly, and abort the current promotion.
        ocf_log( 'debug',
            'pgsql_promote: checking if current node is the best candidate for promotion' );

        # Exclude nodes that are known to be unavailable (not in the current
        # partition) using the "crm_node" command
        my @active_nodes    = split /\s+/ => _get_priv_attr( 'nodes' );
        my $node_to_promote = '';
        my $ans;
        my $max_tl;
        my $max_lsn;
        my $node_tl;
        my $node_lsn;
        my $wal_num;
        my $wal_off;

        # Get the "lsn_location" attribute value for the current node, as set
        # during the "pre-promote" action.
        # It should be the greatest among the secondary instances.
        $ans = _get_priv_attr( 'lsn_location' );

        if ( $ans eq '' ) {
            # This should not happen as the "lsn_location" attribute should have
            # been updated during the "pre-promote" action.
            ocf_exit_reason( 'Can not get current node LSN location' );
            return $OCF_ERR_GENERIC;
        }

        chomp $ans;
        ( $max_tl, $max_lsn ) = split /#/, $ans;

        ocf_log( 'debug', 'pgsql_promote: current node TL#LSN location: %s#%s',
            $max_tl, $max_lsn );

        # Now we compare with the other available nodes.
        foreach my $node ( @active_nodes ) {
            # We exclude the current node from the check.
            next if $node eq $nodename;

            # Get the "lsn_location" attribute value for the node, as set during
            # the "pre-promote" action.
            # This is implemented as a loop as private attributes are asynchronously
            # available from other nodes.
            # see: https://github.com/ClusterLabs/PAF/issues/131
            # NOTE: if a node did not set its lsn_location for some reason, this will end
            # with a timeout and the whole promotion will start again.
            WAIT_FOR_LSN: {
                $ans = _get_priv_attr( 'lsn_location', $node );
                if ( $ans eq '' ) {
                    ocf_log( 'info', 'pgsql_promote: waiting for LSN from %s', $node );
                    select( undef, undef, undef, 0.1 );
                    redo WAIT_FOR_LSN;
                }
            }

            chomp $ans;
            ( $node_tl, $node_lsn ) = split /#/, $ans;

            ocf_log( 'debug',
                'pgsql_promote: comparing with "%s": TL#LSN is %s#%s',
                $node, $node_tl, $node_lsn );

            # If the node has a higher LSN, select it as a best candidate to
            # promotion and keep looping to check the TL/LSN of other nodes.
            if ( $node_tl > $max_tl
                or ( $node_tl == $max_tl and $node_lsn > $max_lsn )
            ) {
                ocf_log( 'debug',
                    'pgsql_promote: "%s" is a better candidate to promote (%s#%s > %s#%s)',
                    $node, $node_tl, $node_lsn, $max_tl, $max_lsn );
                $node_to_promote = $node;
                $max_tl          = $node_tl;
                $max_lsn         = $node_lsn;
            }
        }

        # If any node has been selected, we adapt the master scores accordingly
        # and break the current promotion.
        if ( $node_to_promote ne '' ) {
            ocf_exit_reason(
                '%s is the best candidate to promote, aborting current promotion',
                $node_to_promote );

            # Reset current node master score.
            _set_master_score( '1' );

            # Set promotion candidate master score.
            _set_master_score( '1000', $node_to_promote );

            # We fail the promotion to trigger another promotion transition
            # with the new scores.
            return $OCF_ERR_GENERIC;
        }

        # Else, we will keep on promoting the current node.
    }

    unless (
        # Promote the instance on the current node.
        _runas( $PGCTL, '--pgdata', $pgdata, '-w', 'promote' ) == 0 )
    {
        ocf_exit_reason( 'Error during promotion command' );
        return $OCF_ERR_GENERIC;
    }

    # The instance promotion is asynchronous, so we need to wait for this
    # process to complete.
    while ( pgsql_monitor() != $OCF_RUNNING_MASTER ) {
        ocf_log( 'info', 'Waiting for the promote to complete' );
        sleep 1;
    }

    ocf_log( 'info', 'Promote complete' );

    return $OCF_SUCCESS;
}

# Handler of the "pre-promote" notification.
#
# It is called **before** the actual promotion, either when a failing master
# is being recovered or when a new master must be promoted (switchover or
# first start). As every "notify" action, it is executed almost simultaneously
# on all available nodes.
#
# Depending on the situation, it:
#   * flags a master recovery by setting the private attribute
#     "recover_master" (only on the node to promote) and stops there;
#   * stores a non-zero _check_switchover() result in the private attribute
#     "cancel_switchover" and stops there;
#   * otherwise publishes the local "TL#LSN" in the private attribute
#     "lsn_location" and, on the node to promote, the list of nodes taking
#     part in the election in the private attribute "nodes".
#
# Always returns $OCF_SUCCESS: return codes are ignored during notifications.
sub pgsql_notify_pre_promote {
    # First node listed in the "promote" set of the notification data.
    my $promoted_node = $OCF_NOTIFY_ENV{'promote'}[0]{'uname'};

    ocf_log( 'info', 'Promoting instance on node "%s"', $promoted_node );

    # No need to run an election between slaves when the master is recovered.
    if ( _is_master_recover( $promoted_node ) ) {
        ocf_log( 'warning', 'This is a master recovery!' );

        # Only the node being promoted keeps track of the recovery.
        if ( $promoted_node eq $nodename ) {
            _set_priv_attr( 'recover_master', '1' );
        }

        return $OCF_SUCCESS;
    }

    # Environment cleanup: drop whatever a previous election left behind.
    for my $attr ( qw( lsn_location recover_master nodes cancel_switchover ) ) {
        _delete_priv_attr( $attr );
    }

    # If we are the designated slave to promote during a switchover, check
    # the last WAL entry received from the master.
    if ( _is_switchover( $nodename ) ) {
        my $promoted_here = grep { $_->{'uname'} eq $nodename }
                                 @{ $OCF_NOTIFY_ENV{'promote'} };

        if ( $promoted_here ) {
            my $switchover_rc = _check_switchover();

            if ( $switchover_rc != 0 ) {
                # Shortcut the election process as the switchover will be
                # canceled.
                _set_priv_attr( 'cancel_switchover', $switchover_rc );
                return $OCF_SUCCESS; # return code is ignored during notify
            }

            # Reaching this point means the switchover is safe. Keep going
            # with the election process in case the switchover was
            # requested on the wrong node.
            # FIXME: should we allow a switchover to a lagging slave?
        }
    }

    # We need to trigger an election between existing slaves to promote the
    # best one based on its current LSN location. Each node sets a private
    # attribute "lsn_location" with its TL and LSN location.
    #
    # During the following promote action, the standby designated for the
    # promotion uses these attributes to check if it really is the best
    # candidate, so we can avoid a race condition between the last successful
    # monitor on the previous master and the current promotion.
    #
    # As we can not break the transition from a notification action, the
    # validity of each node TL and LSN is checked during the promotion.

    # Force a checkpoint to make sure the controldata shows the very last TL.
    _query( q{ CHECKPOINT }, {} );

    my %controldata  = _get_controldata();
    my $received_lsn = _get_last_received_lsn( 'in decimal' );

    if ( not defined $received_lsn ) {
        ocf_log( 'warning', 'Unknown current node LSN' );
        # Return codes are ignored during notifications...
        return $OCF_SUCCESS;
    }

    # Location published to the other nodes, formatted as "TL#LSN".
    my $location = "$controldata{'tl'}#$received_lsn";

    ocf_log( 'info', 'Current node TL#LSN: %s', $location );

    # Set the "lsn_location" attribute value for this node so it can be used
    # during the following "promote" action.
    _set_priv_attr( 'lsn_location', $location );

    # NOTE(review): $? is presumably the exit status of the command run by
    # _set_priv_attr -- confirm against that helper.
    if ( $? != 0 ) {
        ocf_log( 'warning', 'Could not set the current node LSN' );
    }

    # If this node is the future master, keep track of the slaves that
    # received the same notification to compare our LSN with theirs during
    # the promotion.
    if ( $promoted_node eq $nodename ) {
        # Build the list of active nodes:
        #   master + slave + start - stop
        # FIXME: Deal with rsc started during the same transaction but **after**
        #        the promotion ?
        my %weight_of;

        for my $rsc ( @{ $OCF_NOTIFY_ENV{'active'} },
                      @{ $OCF_NOTIFY_ENV{'start'} }
        ) {
            $weight_of{ $rsc->{'uname'} }++;
        }

        for my $rsc ( @{ $OCF_NOTIFY_ENV{'stop'} } ) {
            $weight_of{ $rsc->{'uname'} }--;
        }

        # Only keep the nodes that are not scheduled to be stopped.
        my @electors = grep { $weight_of{$_} > 0 } keys %weight_of;

        _set_priv_attr( 'nodes', join( " ", @electors ) );
    }

    return $OCF_SUCCESS;
}

# Handler of the "post-promote" notification, called after a promote action.
#
# At this point there is a new master (or the previous one recovered), so
# the private attributes used by the election are deleted.
#
# Always returns $OCF_SUCCESS.
sub pgsql_notify_post_promote {

    # Environment cleanup!
    for my $attr ( qw( lsn_location recover_master nodes cancel_switchover ) ) {
        _delete_priv_attr( $attr );
    }

    return $OCF_SUCCESS;
}

# Handler of the "pre-demote" notification, called before a demote occurs.
#
# When the local node is about to be demoted as part of a master recovery
# and the local instance is reported as a failed master, try to start the
# instance so the following demote finds it in a normal state.
#
# Always returns $OCF_SUCCESS, whatever the outcome of the start attempt.
sub pgsql_notify_pre_demote {
    # Number of resources to demote that are located on the local node.
    my $demoted_here = grep { $_->{'uname'} eq $nodename }
                            @{ $OCF_NOTIFY_ENV{'demote'} };

    # Do nothing if the local node will not be demoted.
    return $OCF_SUCCESS unless $demoted_here;

    my $monitor_rc = pgsql_monitor();

    # Do nothing if this is not a master recovery.
    if ( !_is_master_recover( $nodename )
        or $monitor_rc != $OCF_FAILED_MASTER
    ) {
        return $OCF_SUCCESS;
    }

    # In case of a master crash, we need to detect if the CRM tries to
    # recover the master clone. The usual transition is:
    #   demote->stop->start->promote
    #
    # There are multiple flaws with this transition:
    #  * the first and second actions will fail because the instance is in
    #    the OCF_FAILED_MASTER state
    #  * the usual start action is dangerous as the instance will start with
    #    a recovery.conf instead of entering a normal recovery process
    #
    # To avoid this, we try to start the instance in recovery from here.
    # If it succeeds, at least it will be demoted correctly with a normal
    # status. If it fails, the failure will be caught in the next steps.

    ocf_log( 'info', 'Trying to start failing master "%s"...',
        $OCF_RESOURCE_INSTANCE );

    # Either the instance managed to start or it could not.
    # We rely on the pg_ctl '-w' switch to take care of this. If it could
    # not start, the error will be caught later during the various checks,
    # which is why the result of the start is deliberately ignored here.
    _pg_ctl_start();

    my %controldata = _get_controldata();

    ocf_log( 'info', 'State is "%s" after recovery attempt',
        $controldata{'state'} );

    return $OCF_SUCCESS;
}

# Handler of the "pre-stop" notification, called before a stop occurs.
#
# When the local node is about to be stopped as part of a slave recovery
# and the controldata maps to $OCF_RUNNING_SLAVE, try to start the instance
# so the following stop finds it in a normal state.
#
# Always returns $OCF_SUCCESS, whatever the outcome of the start attempt.
sub pgsql_notify_pre_stop {
    # Number of resources to stop that are located on the local node.
    my $stopped_here = grep { $_->{'uname'} eq $nodename }
                            @{ $OCF_NOTIFY_ENV{'stop'} };

    # Do nothing if the local node will not be stopped.
    return $OCF_SUCCESS unless $stopped_here;

    my $controldata_rc = _controldata_to_ocf();

    # Do nothing if this is not a slave recovery.
    if ( !_is_slave_recover( $nodename )
        or $controldata_rc != $OCF_RUNNING_SLAVE
    ) {
        return $OCF_SUCCESS;
    }

    # In case of a slave crash, we need to detect if the CRM tries to
    # recover the slave clone. The usual transition is: stop->start
    #
    # This transition can not work because the instance is in the
    # OCF_ERR_GENERIC state. So the stop action will fail, leading most
    # probably to a fencing action.
    #
    # To avoid this, we try to start the instance in recovery from here.
    # If it succeeds, at least it will be stopped correctly with a normal
    # status. If it fails, the failure will be caught in the next steps.

    ocf_log( 'info', 'Trying to start failing slave "%s"...',
        $OCF_RESOURCE_INSTANCE );

    # Either the instance managed to start or it could not.
    # We rely on the pg_ctl '-w' switch to take care of this. If it could
    # not start, the error will be caught later during the various checks,
    # which is why the result of the start is deliberately ignored here.
    _pg_ctl_start();

    my %controldata = _get_controldata();

    ocf_log( 'info', 'State is "%s" after recovery attempt',
        $controldata{'state'} );

    return $OCF_SUCCESS;
}

# Notify type actions, called on all available nodes before (pre) and after
# (post) other actions, like promote, start, ...
#
# Builds the "<type>-<operation>" key from the notification environment and
# dispatches to the matching handler (pre-promote, post-promote, pre-demote
# or pre-stop).
#
# Returns the return code of the handler, or $OCF_SUCCESS when there is no
# notification data or no handler for the notification.
sub pgsql_notify {
    my $type_op;

    ocf_log( 'debug', "pgsql_notify: environment variables: %s",
        Data::Dumper->new( [ \%OCF_NOTIFY_ENV ] )->Sortkeys(1)->Terse(1)->Dump );

    # Nothing to dispatch without notification data. Return an explicit code
    # rather than undef: the main section hands our return value to exit().
    return $OCF_SUCCESS unless %OCF_NOTIFY_ENV;

    $type_op = "$OCF_NOTIFY_ENV{'type'}-$OCF_NOTIFY_ENV{'operation'}";

    # The single-element loop only aliases $_ to $type_op for the matches.
    for ( $type_op ) {
        if    ( /^pre-promote$/  ) { return pgsql_notify_pre_promote()  }
        elsif ( /^post-promote$/ ) { return pgsql_notify_post_promote() }
        elsif ( /^pre-demote$/   ) { return pgsql_notify_pre_demote()   }
        elsif ( /^pre-stop$/     ) { return pgsql_notify_pre_stop()     }
    }

    # Other notifications need no specific handling.
    return $OCF_SUCCESS;
}

# "reload" action: allows online modification of resource parameter values.
#
# Nothing has to be done on the instance itself: declaring the action is
# enough to inform pacemaker that the modification of any non-unique
# parameter can be applied without having to restart the resource.
#
# Logs the reload and always returns $OCF_SUCCESS.
sub pgsql_reload {
    ocf_log( 'info', 'Instance "%s" reloaded', $OCF_RESOURCE_INSTANCE );

    return $OCF_SUCCESS;
}

############################################################
#### MAIN

# These two actions are answered first, before any of the sanity checks
# below are run.
if ( $__OCF_ACTION eq 'meta-data' ) {
    exit ocf_meta_data();
}

if ( $__OCF_ACTION eq 'methods' ) {
    exit ocf_methods();
}

# Avoid "could not change directory" when executing commands as "system-user".
chdir File::Spec->tmpdir();

# Mandatory sanity checks: both the PGDATA and the data_directory must exist.
# Each entry holds the exit reason format and the directory to check.
for my $check (
    [ 'PGDATA "%s" does not exist',         $pgdata  ],
    [ 'data_directory "%s" does not exist', $datadir ],
) {
    my ( $reason, $dir ) = @{ $check };

    next if -d $dir;

    ocf_exit_reason( $reason, $dir );
    exit $OCF_ERR_ARGS;
}

# Set PostgreSQL version and current node name, then validate the setup.
$PGVERNUM  = _get_pg_version();
$nodename  = ocf_local_nodename();
$exit_code = pgsql_validate_all();

# Stop here if the validation failed...
exit $exit_code unless $exit_code == $OCF_SUCCESS;

# ...or if the validation was the only thing requested.
exit $exit_code if $__OCF_ACTION eq 'validate-all';

# Run the requested action. The single-element loop only aliases $_ to the
# action name for the matches below.
for ( $__OCF_ACTION ) {
    $exit_code = /^start$/   ? pgsql_start()
               : /^stop$/    ? pgsql_stop()
               : /^monitor$/ ? pgsql_monitor()
               : /^promote$/ ? pgsql_promote()
               : /^demote$/  ? pgsql_demote()
               : /^notify$/  ? pgsql_notify()
               : /^reload$/  ? pgsql_reload()
               :               $OCF_ERR_UNIMPLEMENTED;
}

exit $exit_code;


=head1 EXAMPLE CRM SHELL

The following is an example configuration for a pgsqlms resource using the
crm(8) shell:

  primitive pgsqld pgsqlms                                                 \
    params pgdata="/var/lib/postgresql/9.6/main"                           \
      bindir="/usr/lib/postgresql/9.6/bin"                                 \
      pghost="/var/run/postgresql"                                         \
      recovery_template="/etc/postgresql/9.6/main/recovery.conf.pcmk"      \
      start_opts="-c config_file=/etc/postgresql/9.6/main/postgresql.conf" \
    op start timeout=60s                                                   \
    op stop timeout=60s                                                    \
    op promote timeout=30s                                                 \
    op demote timeout=120s                                                 \
    op monitor interval=15s timeout=10s role="Master"                      \
    op monitor interval=16s timeout=10s role="Slave"                       \
    op notify timeout=60s

  ms pgsql-ha pgsqld meta notify=true


=head1 EXAMPLE PCS

The following is an example configuration for a pgsqlms resource using pcs(8):

  pcs resource create pgsqld ocf:heartbeat:pgsqlms            \
    bindir=/usr/pgsql-9.6/bin pgdata=/var/lib/pgsql/9.6/data  \
    op start timeout=60s                                      \
    op stop timeout=60s                                       \
    op promote timeout=30s                                    \
    op demote timeout=120s                                    \
    op monitor interval=15s timeout=10s role="Master"         \
    op monitor interval=16s timeout=10s role="Slave"          \
    op notify timeout=60s --master notify=true

=head1 SEE ALSO

http://clusterlabs.org/

=head1 AUTHOR

Jehan-Guillaume de Rorthais and Mael Rimbault.

=cut
