%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Part of the replication package for the paper
%   "Marginal Effects for Probit and Tobit with Endogeneity"
%   by Kirill S. Evdokimov, Ilze Kalnina, and Andrei Zeleneev.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [theta_hat, loglik_val] = tobit_MLE(y0, x0, theta_0)
%function [theta_hat] = tobit_MLE(y, x)
%  Estimates theta_hat = [beta_hat; sigma_hat] via MLE
  flag_scale_xy = 1; % without scaling the optimization can be numerically unstable, and may not pass the 'DerivativeCheck'
  if flag_scale_xy
    mul_y = sqrt(mean(y0.^2));
    mul_x = sqrt(mean(x0.^2));
  else
    mul_y = 1; mul_x = 1;
  end
  assert(all(y0>=0), "tobit_MLE: Error -- there are Y_i < 0");
  IYp = y0>0;
  if ~any(IYp)
    error('tobit_MLE: Error -- Y_i = 0 for all i, cannot estimate parameters');
  end
  
  y = y0/mul_y; x = x0./mul_x;
  n = size(y,1);  assert(all(size(y)==[n,1]));
  assert(size(x,1)==n);
  
  if nargin<3
    beta_0 = (x\y);
    sig2_0 = var(y-x*beta_0);
  else
    theta_0 = theta_0(:);
    assert(length(theta_0)==size(x,2)+1);
    beta_0 = theta_0(1:end-1).*mul_x(:)/mul_y;
    sig2_0 = (theta_0(end)/mul_y).^2; assert(theta_0(end)>0);
  end
    
  persistent opt_dim_beta_0 opt_options
  if isempty(opt_dim_beta_0) || isempty(opt_options) || (opt_dim_beta_0~=length(beta_0))
    % storing opt_options between calls because optimoptions is very slow (at least on Matlab 2022a)
    opt_dim_beta_0 = length(beta_0); %determines MaxFunctionEvaluations
    opt_options = optimoptions(@fminunc,'Algorithm', 'quasi-newton' ... 
        , 'SpecifyObjectiveGradient', true ...
        , 'OptimalityTolerance', 1e-10, 'StepTolerance', 1e-10, 'MaxFunctionEvaluations', 1e3*(length(beta_0)+1) ...
        , 'Display', 'off' ... 'final-detailed' ... 
        , 'CheckGradients', false); %true);
  end
  
  % Reparameterization
  gam_0 = 1/sqrt(sig2_0);
  delta_0 = gam_0*beta_0;
  
  b_0 = [delta_0; gam_0];
  
  test_val = negative_tobit_loglik(b_0); 
  isok = @(x) ~isnan(x) && ~isinf(x);
  if ~isok(test_val) % could happen because of log(erfc(...)), trying alternative initial values
    arr_b_try = [[0*delta_0; gam_0] [0*delta_0; 1] [0*delta_0; 1e-4] [0*delta_0; 1e4]];
    best_val = +Inf; best_b = b_0;
    for i_try = 1:size(arr_b_try,2)
      b_try = arr_b_try(:,i_try);
      val = negative_tobit_loglik(b_try);
      if isok(val) && val<best_val
        best_b = b_try; best_val = val;
      end
    end
    b_0 = best_b;
  end

  [b_hat,loglik_val_w_mul,exitflag,output] = fminunc(@negative_tobit_loglik, b_0, opt_options);
  
  % to get correct loglik_val we return to y0 and x0
  b_hat_wo_normaliz = [b_hat(1:end-1)./mul_x(:); b_hat(end)/mul_y];
  if flag_scale_xy
    y = y0; x = x0;
  end
  loglik_val = -negative_tobit_loglik(b_hat_wo_normaliz);
    
  theta_hat = [b_hat(1:end-1)/b_hat(end); 1/b_hat(end)];
  theta_hat = theta_hat*mul_y;
  theta_hat(1:end-1) = theta_hat(1:end-1)./mul_x(:);

  function [loglik, grad] = negative_tobit_loglik(b, bReturnScoreVariance)
    if nargin<2; bReturnScoreVariance=0; end %if true, returns covariance of the score in 'grad' output
    delta = b(1:end-1);
    gam = b(end);
    % v_llik_old = IYp.*(-(y*gam - x*delta).^2 - log(2*pi) - 2*log(1/gam))/2 + (1-IYp).*log(erfc((x*delta)/sqrt(2))/2);
    v_llik = IYp.*(-(y*gam - x*delta).^2 - log(2*pi) - 2*log(1/gam))/2 + (1-IYp).*log_normcdf(-x*delta);
    % assert_feq(v_llik_old, v_llik);
    loglik = -full(sum(v_llik));
    
    if nargout>=2
      score = [IYp.*x.*(y*gam - x*delta) - (1 - IYp).*x.*inv_mills_ratio(x*delta) ...
       , IYp.*(1/gam - y.^2*gam + (x*delta).*y)];
      
      if bReturnScoreVariance
        grad = full(cov(score)); % for Asy Var calculation (but note that this is in the (delta,gamma) parameterization!
      else
        grad = -full(sum(score));
      end
    end
  end

end %tobit_MLE()

function s = log_normcdf(s)
  % computes log(normcdf(s))
  todo = s<0; %for these we will replace 1-normcdf(s) with erfcx...
  s(~todo) = log(erfc(-s(~todo)/sqrt(2)))-log(2);
  s(todo) = log(erfcx(-s(todo)/sqrt(2)))-(s(todo).^2/2)-log(2);
end

function s = inv_mills_ratio(s)
  % computes  normpdf(s)./(1-normcdf(s))
  s = sqrt(2/pi)./erfcx(s/sqrt(2)); % works well for all s, including very large
end

