% ?Please cite the following paper:
% ?Huang D., Jiang F., Li K., Tong G. and Zhou G., "Scaled PCA: A New Approach to Dimension Reduction", Management Science 68(3), 2022, 1591-2376. 

function f=sPCAest(target, X, nfac)

%   Input variables:
%   ---------------------------------------------------------
%   X               = T by N Matrix of X variables values
%   target          = T by 1 vector of Y variable values
%   nfac            = number of sPCA factors to be extracted

%   Output
%   f               = sPCA factors

T=length(target);
if (length(X) ~= T)
    error('X and Y varibles not of equal length');
end

% standardize X to Xs
Xs=standard(X);
beta=NaN(1,size(Xs,2));
for j=1:size(Xs,2)
    xvar=[ones(T,1) Xs(:,j)];
    parm=xvar\target;
    beta(j)=parm(2);
end

% one can choose to winsorize to remove extreme values
% beta_win=winsor(abs(beta),[0 99]);

scaleXs=NaN(size(Xs,1),size(Xs,2));
for j=1:size(Xs,2)
     scaleXs(:,j)=Xs(:,j)*beta(j); 
%    scaleXs(:,j)=Xs(:,j)*beta_win(j);  
end

[~,f,~,~, ~]=pc_T(scaleXs,nfac);


%% sub functions
% principal components with normalization F'F/T=I

% X is observed

% r is the true number of true factors

% F is T by r matrix of true factors

% Lambda N by r is the true loading matrix

% C=F*Lambda' T by N is the true common component

% chat is the estimated common component

function [ehat,fhat,lambda,ve2,ss]=pc_T(y,nfac) %adding ve2

[bigt,bign]=size(y);

yy=y*y';

[Fhat0,eigval,Fhat1]=svd(yy); %for semi-def symmetric matrix, same as eig value decomposition but sorts in descending order

fhat=Fhat0(:,1:nfac)*sqrt(bigt);

lambda=y'*fhat/bigt;

%chi2=fhat*lambda';

%diag(lambda'*lambda)

%diag(fhat'*fhat)                % this should equal the largest eigenvalues

%sum(diag(eigval(1:nfac,1:nfac)))/sum(diag(eigval))

%mean(var(chi2))                 % this should equal decomposition of variance



ehat=y-fhat*lambda';



ve2=sum(ehat'.*ehat')'/bign;

ss=diag(eigval);


function[y,varargout] = winsor(x,p)
% WINSOR     Winsorize a vector 
% INPUTS   : x - n*1 data vector
%            p - 2*1 vector of cut-off percentiles (left, right) 
% OUTPUTS  : y - winsorized x, n*1 vector
%            i - (optional) n*1 value-replaced-indicator vector
% NOTES    : Let p1 = prctile(x,p(1)), p2 = prctile(x,p(2)). (Note
%            that PRCTILE ignores NaN values). Then 
%            if x(i) < p1, y(i) = min(x(j) | x(j) >= p1)
%            if x(i) > p2, y(i) = max(x(j) | x(j) <= p2)
% EXAMPLE  : x = rand(10,1), y = winsor(x,[10 90])
% AUTHOR   : Dimitri Shvorob, dimitri.shvorob@vanderbilt.edu, 4/15/07

if ~isvector(x)
   error('Input argument "x" must be a vector')
end  
if nargin < 2
   error('Input argument "p" is undefined')
end 
if ~isvector(p)
   error('Input argument "p" must be a vector')
end  
if length(p) ~= 2
   error('Input argument "p" must be a 2*1 vector')
end  
if p(1) < 0 || p(1) > 100
   error('Left cut-off percentile is out of [0,100] range')
end  
if p(2) < 0 || p(2) > 100
   error('Right cut-off percentile is out of [0,100] range')
end  
if p(1) > p(2)
   error('Left cut-off percentile exceeds right cut-off percentile')
end  
p = prctile(x,p);
i1 = x < p(1); v1 = min(x(~i1));
i2 = x > p(2); v2 = max(x(~i2));
y = x;
y(i1) = v1;
y(i2) = v2;
if nargout > 1
   varargout(1) = {i1 | i2};
end   

function x=standard(y)
T=size(y,1);
%N=size(y,2);
my=repmat(mean(y),T,1);
sy=repmat(std(y),T,1);
x=(y-my)./sy;