3.0_caa.Rmd

---
title: "Catch-at-age"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: FALSE
    code_folding: show
    number_sections: TRUE
---
    
# SET-UP
```{r  markdown, echo=F}
# Load knitr (also provides kable) and set the defaults applied to every chunk.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
library(knitr)
opts_chunk$set(
  echo = TRUE,          # show the code of each chunk
  collapse = TRUE,      # merge source and output into one block where possible
  fig.align = "center",
  fig.width = 9,
  fig.height = 6
)
options(width = 140)    # wide text output so printed tables are not wrapped
```    

## settings
```{r settings, message = F}
# Project-wide settings script; presumably defines the directories (dir.rdat, dir.dat),
# my.year and the helper functions used below -- confirm in 0.0_settings.R.
source("0.0_settings.R")
```

## load all data
```{r data, message = F}
# Intermediate .Rdata files written by the 1.0 read step (catch, biological samples, length-frequencies)
for (rdata in c("catch.Rdata", "bio.Rdata", "lf.Rdata")) {
    load(paste0(dir.rdat, rdata))
}

# Lookup tables mapping NAFO areas to regions and gear categories to gear groups
group.region <- read.table(paste0(dir.dat, "caa_group_region.txt"), header = TRUE)
group.gear   <- read.table(paste0(dir.dat, "caa_group_gear.txt"), header = TRUE)
```

# LENGTH-WEIGHT RELATIONSHIPS 

Sample weights are often missing and, where present, appear to be imprecise. The weight of a sample is therefore estimated from the lengths of the individuals and the number of fish sampled. This section fits length-weight relationships for every year and period, using the biological samples ('carbio').

```{r lw, message = F}

## subset: fish with both a length and a weight, on the log scale
lw <- bio[!(is.na(bio$length) | is.na(bio$weight)), c('year','trim','length','weight')]
lw[['logw']] <- log(lw[['weight']])
lw[['logl']] <- log(lw[['length']])

## remove extremes
# ggplot(lw,aes(x=logl,y=logw))+geom_point()
lw <- lw[lw$logl > 3 & lw$logw > 0, ]                    # minimal length and weight

## relationships by year and period (quarter)
## (robust regression instead of subjectively removing some points each year)
lw.modR <- paste0(dir.rdat, "lw.mod.Rdata")

if (file.exists(lw.modR)) {
    # reuse the fit stored by a previous run
    load(lw.modR)
} else {
    lw.mod <- with(lw, fit.lw(logl, logw, year, trim, tresh = 300, period.unit = 'quarter'))
    save(lw.mod, file = lw.modR)
}
```

## CHECK {.tabset}
### Prediction
```{r lw_fit, message=FALSE}
# Cross join of any number of data frames (merge with by = NULL returns every row combination).
# as.is = TRUE is passed explicitly: type.convert() warns when it is left unspecified.
expand.grid.df <- function(...) {
    type.convert(Reduce(function(...) merge(..., by = NULL), list(...)), as.is = TRUE)
}

# Predict weight for each fitted year/period over the length range of the length-frequency data.
# lf$length can contain missing values (they are filtered out when lf.caa is built), so
# na.rm = TRUE is needed; otherwise max() is NA and the 0:max sequence cannot be built.
max.length <- ceiling(max(lf$length, na.rm = TRUE))
check <- expand.grid.df(lw.mod, data.frame(length = 0:max.length))

# back-transform the log-log model; cf is presumably a correction factor from fit.lw -- confirm
check$weight <- with(check, exp(intercept + slope * log(length)) * cf)

ggplot(check, aes(x = length, y = weight, col = year, group = year)) +
    geom_line(size = 0.5) +
    facet_wrap(~period) +
    scale_y_continuous(limits = c(0, max(lw$weight))) +   # cap at the largest observed weight
    scale_color_viridis_c() +
    labs(title = 'Predicted length-weight relationships')
```


# LENGTH-FREQUENCY DATA
```{r lf, message = F, fig.height = 3,fig.width = 4}
# Single cache path used for the existence test, the save and the load.
# Previously "lw.caa.Rdata" was tested and loaded while "lf.caa.Rdata" was saved, so the
# cache was never found and the data were rebuilt on every run.
lf.caaR <- paste0(dir.rdat, "lf.caa.Rdata")
lw.caaR <- lf.caaR      # legacy variable name, kept in case it is referenced elsewhere
if(!file.exists(lf.caaR)){
    ## subset
    lf.caa <- lf[!is.na(lf$length),c('year','trim','nafo','gear.cat','length','n','sample.id')]
    
    ## group regions and gears
    # PERIOD: quarter
    names(lf.caa)[2] <- 'period'
    
    # REGION: large scale regions 
    lf.caa$region <- group.region[match(lf.caa$nafo,group.region$nafo),'region']
    lf.caa$nafo <- NULL
    
    # GEAR: similar selectivity gears
    lf.caa$gear <- group.gear[match(lf.caa$gear.cat,group.gear$gear.cat),'gear.group']
    lf.caa$gear.cat <- NULL
    
    ## Append length-frequencies from carbio (random samples only)
    lf.bio <- bio[!is.na(bio$length) & bio$random==TRUE,c('year','trim','nafo','gear','length','sample.id')]
    lf.bio$sample.id <- -lf.bio$sample.id   # negative numbers for these samples for easy tracking
    
    # group regions and gears
    names(lf.bio)[2] <- 'period'
    lf.bio$region <- group.region[match(lf.bio$nafo,group.region$nafo),'region']
    lf.bio$nafo <- NULL
    lf.bio$gear <- group.gear[match(lf.bio$gear,group.gear$gear.cat),'gear.group']
    
    # NOTE: inside this if-block these tables are not auto-printed; kept for interactive checks
    table(lf.bio$period,useNA = 'always')
    table(lf.bio$region,useNA = 'always')
    table(lf.bio$gear,useNA = 'always')
                    
    # one row per fish -> number of fish (n) per length within each sample
    lf.bio <- ddply(lf.bio,c('year','period','region','gear','length','sample.id'),summarise,n=length(year))
    
    lf.caa <- rbind(lf.caa,lf.bio)
    
    ## convert lengths to sample weights (kg)
    lf.caa <- merge(lf.caa,lw.mod,all.x=TRUE)   # joins on the columns shared with lw.mod
    lf.caa$weight.unit <- with(lf.caa,exp(intercept+slope*log(length))*cf/1000)
    
    ## make equal length classes
    table(lf$length.bin)
    lf.caa$length <-roundFish(lf.caa$length,5) 
    
    ## save
    save(lf.caa, file = lf.caaR)
}else{ 
    load(lf.caaR)
}
```

## CHECK {.tabset}
### model samples
```{r lw_table}
# One row per year/period/model option, then count the year-period combinations per option
check <- unique(lf.caa[, c('year', 'period', 'option')])
kable(table(option = check$option, period = check$period))
```

### predictions
```{r lw_predictions, fig.height = 3,fig.width = 10}
# Side-by-side histograms of the predicted weight per fish and per length class within a sample
grid.arrange(
  ggplot(lf.caa, aes(x = weight.unit)) +
    geom_histogram(binwidth = 0.1) +
    labs(title = 'Predicted unit weights', x = 'Weight (kg)'),
  ggplot(lf.caa, aes(x = weight.unit * n)) +
    geom_histogram(binwidth = 1) +
    labs(title = 'Predicted sample weights', x = 'Weight (kg)'),
  ncol = 2
)
```

## TABLES {.tabset}
### overview
```{r lf_tab_overview, message = F}
# Yearly overview of the length-frequency samples.
df <- lf.caa

# Unique sample identifier. The separator must be given to paste() via sep; the former
# collapse='.' was passed to with(), where it was silently ignored.
df$id <- with(df, paste(sample.id, year, period, region, gear, sep = '.'))

# N = number of samples, n = number of rows (length classes), and the mean/min/max number
# of rows per sample
lf.overall <- ddply(df, c('year'), summarise,
                    N = length(unique(id)),
                    n = length(id),
                    mean = round(mean(table(id)), 0),
                    min = min(table(id)),
                    max = max(table(id)))
kable(lf.overall)
```

### period
```{r lf_tab_period, message = F}
# Samples (N) and measured fish (n) per year and period; the table shows N in wide format
dfp <- ddply(df, c('year', 'period'), summarise, N = length(unique(id)), n = sum(n))
dfp <- dcast(dfp, year ~ period, value.var = 'N', fill = 0)
dfp[['total']] <- rowSums(dfp[, names(dfp) != 'year'])   # every column except year
kable(dfp)
```

### region
```{r lf_tab_region, message = F}
# Samples (N) and measured fish (n) per year and region; the table shows N in wide format
dfr <- ddply(df, c('year', 'region'), summarise, N = length(unique(id)), n = sum(n))
dfr <- dcast(dfr, year ~ region, value.var = 'N', fill = 0)
dfr[['total']] <- rowSums(dfr[, names(dfr) != 'year'])   # every column except year
kable(dfr)
```
### gear
```{r lf_tab_gear, message = F}
# Samples (N) and measured fish (n) per year and gear group; the table shows N in wide format
dfg <- ddply(df, c('year', 'gear'), summarise, N = length(unique(id)), n = sum(n))
dfg <- dcast(dfg, year ~ gear, value.var = 'N', fill = 0)
dfg[['total']] <- rowSums(dfg[, names(dfg) != 'year'])   # every column except year
kable(dfg)
```

## PLOTS {.tabset}

### period
```{r lf_samples_period, message = F}
# Number of samples (top) and measured fish (bottom) per year, stacked by period
dfp <- ddply(df, c('year', 'period'), summarise, N = length(unique(id)), n = sum(n))
grid.arrange(
  ggplot(dfp, aes(x = year, y = N, fill = period)) + geom_col() + scale_fill_viridis_c(),
  ggplot(dfp, aes(x = year, y = n, fill = period)) + geom_col() + scale_fill_viridis_c(),
  ncol = 1
)
```

### region
```{r lf_samples_region, message = F}
# Number of samples (top) and measured fish (bottom) per year, stacked by region
dfr <- ddply(df, c('year', 'region'), summarise, N = length(unique(id)), n = sum(n))
grid.arrange(
  ggplot(dfr, aes(x = year, y = N, fill = region)) + geom_col() + scale_fill_viridis_d(),
  ggplot(dfr, aes(x = year, y = n, fill = region)) + geom_col() + scale_fill_viridis_d(),
  ncol = 1
)

```

### gear
```{r lf_samples_gear, message = F}
# Number of samples (top) and measured fish (bottom) per year, stacked by gear group
dfg <- ddply(df, c('year', 'gear'), summarise, N = length(unique(id)), n = sum(n))
grid.arrange(
  ggplot(dfg, aes(x = year, y = N, fill = gear)) + geom_col() + scale_fill_viridis_d(),
  ggplot(dfg, aes(x = year, y = n, fill = gear)) + geom_col() + scale_fill_viridis_d(),
  ncol = 1
)

```

# BIO DATA
```{r al global, message = F, fig.height = 6,fig.width = 14}
## subset: fish with both an age and a length
bio.caa <- bio[!(is.na(bio$agef) | is.na(bio$length)),
               c('year','trim','nafo','gear','length','agef','sample.id')]
names(bio.caa)[names(bio.caa) == 'agef'] <- 'age'

## remove absurd ones
# ages too high
table(bio.caa$age)
bio.caa <- bio.caa[bio.caa$age < 18, ]  # 4 out of 6 fish have wrong length, unsure about remaining two

# length not matching the age
bio.caa <- ddply(bio.caa, c('age'), transform, outlier = outlier(length, coef = 3)) # rule of thumb extremes
bio.caa[bio.caa$age == 0 & bio.caa$length > 300, 'outlier'] <- TRUE                  # odd second mode
table(bio.caa$outlier, bio.caa$age)

# check: length histograms per age (left) and length-at-age boxplots with flagged fish (right)
grid.arrange(
  ggplot(bio.caa, aes(x = length, fill = outlier)) +
    geom_histogram() +
    facet_wrap(~age, scales = 'free'),
  ggplot(bio.caa, aes(x = age, y = length, group = age)) +
    geom_boxplot() +
    geom_point(data = bio.caa[bio.caa$outlier == TRUE, ], aes(col = outlier)),
  ncol = 2, widths = c(2, 1)
)

# drop the flagged fish and the helper column
bio.caa <- bio.caa[bio.caa$outlier == FALSE, ]
bio.caa$outlier <- NULL

## group regions and gears
# PERIOD: quarter
names(bio.caa)[names(bio.caa) == 'trim'] <- 'period'
table(bio.caa$period, useNA = 'always')

# REGION: large scale regions
bio.caa$region <- group.region$region[match(bio.caa$nafo, group.region$nafo)]
bio.caa$nafo <- NULL
table(bio.caa$region, useNA = 'always')

# GEAR: similar selectivity gears
bio.caa$gear <- group.gear$gear.group[match(bio.caa$gear, group.gear$gear.cat)]
table(bio.caa$gear, useNA = 'always')

## make equal length classes (same bin width as the length-frequency data)
bio.caa$length <- roundFish(bio.caa$length, 5)
bio.caa <- bio.caa[bio.caa$length > 0, ]
range(bio.caa$length)

save(bio.caa, file = paste0(dir.rdat, "bio.caa.Rdata"))
```

## TABLES {.tabset}

### overview
```{r al_tab_global, message = F}
# Yearly overview of the age-length samples (one row per fish in bio.caa).
df <- bio.caa

# Unique sample identifier. The separator must be given to paste() via sep; the former
# collapse='.' was passed to with(), where it was silently ignored.
df$id <- with(df, paste(sample.id, year, period, region, gear, sep = '.'))

# N = number of samples, n = number of fish, mean = mean number of fish per sample
al.overall <- ddply(df, c('year'), summarise,
                    N = length(unique(id)),
                    n = length(id),
                    mean = mean(table(id)))
kable(al.overall)
```

### period
```{r al_tab_period, message = F}
# Samples (N) and fish (n) per year and period; the table shows N in wide format.
# bio.caa has one row per fish and no `n` column, so fish are counted with length(id)
# (as in the age-length plot chunks) instead of sum(n).
dfp <- ddply(df, c('year', 'period'), summarise, N = length(unique(id)), n = length(id))
dfp <- dcast(dfp, year ~ period, value.var = 'N', fill = 0)
dfp$total <- rowSums(dfp[, -1])   # all columns except year
kable(dfp)
```

### region
```{r al_tab_region, message = F}
# Samples (N) and fish (n) per year and region; the table shows N in wide format.
# bio.caa has one row per fish and no `n` column, so fish are counted with length(id)
# (as in the age-length plot chunks) instead of sum(n).
dfr <- ddply(df, c('year', 'region'), summarise, N = length(unique(id)), n = length(id))
dfr <- dcast(dfr, year ~ region, value.var = 'N', fill = 0)
dfr$total <- rowSums(dfr[, -1])   # all columns except year
kable(dfr)
```

### gear
```{r al_tab_gear, message = F}
# Samples (N) and fish (n) per year and gear group; the table shows N in wide format.
# bio.caa has one row per fish and no `n` column, so fish are counted with length(id)
# (as in the age-length plot chunks) instead of sum(n).
dfg <- ddply(df, c('year', 'gear'), summarise, N = length(unique(id)), n = length(id))
dfg <- dcast(dfg, year ~ gear, value.var = 'N', fill = 0)
dfg$total <- rowSums(dfg[, -1])   # all columns except year
kable(dfg)
```


## PLOTS {.tabset}
```{r al_samples, message = F}
# Data for the age-length sample plots, with a unique identifier per sample.
# The separator must be given to paste() via sep; the former collapse='.' was passed to
# with(), where it was silently ignored.
df <- bio.caa
df$id <- with(df, paste(sample.id, year, period, region, gear, sep = '.'))
```

### period
```{r al_samples_period, message = F}
# Number of samples (top) and aged fish (bottom) per year, stacked by period
dfp <- ddply(df, c('year', 'period'), summarise, N = length(unique(id)), n = length(id))
grid.arrange(
  ggplot(dfp, aes(x = year, y = N, fill = period)) + geom_col() + scale_fill_viridis_c(),
  ggplot(dfp, aes(x = year, y = n, fill = period)) + geom_col() + scale_fill_viridis_c(),
  ncol = 1
)
```

### region
```{r al_samples_region, message = F}
# Number of samples (top) and aged fish (bottom) per year, stacked by region
dfr <- ddply(df, c('year', 'region'), summarise, N = length(unique(id)), n = length(id))
grid.arrange(
  ggplot(dfr, aes(x = year, y = N, fill = region)) + geom_col() + scale_fill_viridis_d(),
  ggplot(dfr, aes(x = year, y = n, fill = region)) + geom_col() + scale_fill_viridis_d(),
  ncol = 1
)
```

### gear
```{r al_samples_gear, message = F}
# Number of samples (top) and aged fish (bottom) per year, stacked by gear group
dfg <- ddply(df, c('year', 'gear'), summarise, N = length(unique(id)), n = length(id))
grid.arrange(
  ggplot(dfg, aes(x = year, y = N, fill = gear)) + geom_col() + scale_fill_viridis_d(),
  ggplot(dfg, aes(x = year, y = n, fill = gear)) + geom_col() + scale_fill_viridis_d(),
  ncol = 1
)
```

# CATCH DATA
```{r catch split, message = F, fig.height = 8,fig.width = 6}
# keep only the columns needed to build the catch strata
catch <- catch[, c('year', 'month', 'nafo', 'gear.cat', 'source', 'catch')]

## identify period, region and gear
# 1) PERIOD: quarters (months 1-3 -> 1, ..., 10-12 -> 4)
catch[['period']] <- ceiling(catch[['month']] / 3)

# 2) REGION: large scale regions
catch[['region']] <- group.region$region[match(catch$nafo, group.region$nafo)]

# 3) GEAR: similar selectivity gears
catch[['gear']] <- group.gear$gear.group[match(catch$gear.cat, group.gear$gear.cat)]

## total catch by level; strata without any catch are dropped
catch.caa <- ddply(catch, c('year', 'region', 'period', 'gear'), summarise,
                   catch = sum(catch, na.rm = TRUE))
catch.caa <- catch.caa[catch.caa$catch > 0, ]

save(catch.caa, file = paste0(dir.rdat, "catch.caa.Rdata"))
```

## PLOTS {.tabset}

### region
```{r catch_region, message = F}
# Annual catch by region; strata without a region (NA) are drawn in grey
plotCatch(catch.caa, x = 'year', y = 'catch', fill = 'region') +
  scale_fill_viridis_d(na.value = "grey50")
```

### gear
```{r catch_gear, message = F}
# Annual catch by gear group; strata without a gear group (NA) are drawn in grey
plotCatch(catch.caa, x = 'year', y = 'catch', fill = 'gear') +
  scale_fill_viridis_d(na.value = "grey50")
```

### period
```{r catch_period, message = F}
# Annual catch by period (quarter)
# NOTE(review): period is numeric (ceiling(month/3)); a discrete fill scale only works if
# plotCatch() converts the fill variable to a factor -- confirm.
plotCatch(catch.caa, x = 'year', y = 'catch', fill = 'period') +
  scale_fill_viridis_d(na.value = "grey50")
```


# CATCH-AT-AGE
```{r caa, message = F}
# Cached outputs of the two catch-at-age steps
catch.diffusedR <- paste0(dir.rdat, "catch.diffused.Rdata")
caaR <- paste0(dir.rdat, "caa.Rdata")

# Restrict all inputs to the years with length-frequency data. This is done before the
# cache test so that catch.caa, which the total-catch check compares against caa, covers
# the same years whether the results are computed here or loaded from disk.
years <- my.year[my.year>=1976]                           # no lf prior to this year
catch.caa <- catch.caa[catch.caa$year %in% years,]        # years for which lf
lf.caa <- lf.caa[lf.caa$year %in% years,]                 # years for which lf
bio.caa <- bio.caa[bio.caa$year %in% years,]              # years for which lf
  
if(!all(file.exists(c(catch.diffusedR,caaR)))){
    # step 1) get samples
    lf.caa[,which(names(lf.caa) %in% names(lw.mod)[-c(1:2)])] <- NULL   # remove model details
    
    catch.diffused <- get.samples(catch=catch.caa,
                       lf=lf.caa, 
                       al=bio.caa,
                       min.lf.samples =2,          # min samples for length-frequency
                       min.al.samples =2,          # min samples for age-length key
                       min.al.fish = 5,
                       period.unit='quarter', # quarterly grouping instead of monthly
                       subsample = TRUE,     # age length keys are stratified subsamples, not necessarily matching the lf samples
                       prob.al=0.75)        # max probability with which for a given length an age class might be missing in stratum specific  ALK

    catch.diffused$age.0 <- NULL                # remove age 0 (total landings somewhat smaller)
    save(catch.diffused, file =  catch.diffusedR)
    
    # step 2) get actual caa (ages 10 and older pooled in a plus group)
    caa <- get.caa(x=catch.diffused,plus=10)
    save(caa, file =  caaR)     
}else{ 
    # both files exist: reuse the stored results
    load(catch.diffusedR)
    load(caaR)
}

```

## CHECK {.tabset}
### samples
```{r caa_check_samples, fig.height = 6,fig.width = 8}
# Per catch stratum (id): which sampling option was used and how many samples and fish
# supported the length-frequency and the age-length key
check <- ddply(
  catch.diffused, c('id', 'year', 'catch'), summarise,
  option.lengthfreq  = unique(option.lengthfreq),
  nsample.lengthfreq = unique(nsample.lengthfreq),
  nfish.lengthfreq   = sum(n.lf),
  option.agelength   = unique(option.agelength),
  nsample.agelength  = unique(nsample.agelength),
  nfish.agekey       = sum(n.al)
)

# long format: one row per stratum and diagnostic
check <- reshape::melt(check, id.vars = c('id', 'year', 'catch'))
ggplot(check, aes(x = value, fill = catch)) +
  geom_histogram(bins = 30) +
  facet_wrap(~variable, scales = 'free')
```

### samples annual
```{r caa_check_samples_annual, fig.height = 5,fig.width = 12}
# Yearly distribution of each sampling diagnostic (one panel per diagnostic)
ggplot(check, aes(x = year, y = value, group = year)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = 'free_y')
```

### total
```{r caa_check_total, fig.height = 6,fig.width = 8}
# Catch-at-age in weight stacked by age (bars) against the total annual catch (line)
tot <- ddply(catch.caa, c('year'), summarise, catch = sum(catch))
ggplot(caa, aes(x = year)) +
  geom_col(aes(y = caaw, fill = age)) +
  geom_line(data = tot, aes(y = catch)) +
  scale_fill_viridis()
```

## PLOTS {.tabset}
### CAAN
```{r caan_plot, message = F, fig.height = 3,fig.width = 10}
# Bubble plot of caan by year and age, with one axis break per age
ggplot(caa, aes(x = year, y = age, size = caan)) +
  geom_point(alpha = 0.8) +
  scale_size(range = c(1, 12)) +
  scale_y_continuous(breaks = seq(min(caa$age), max(caa$age)))
```

### CAAW
```{r caaw_plot, message = F, fig.height = 3,fig.width = 10}
# Bubble plot of caaw by year and age, with one axis break per age
ggplot(caa, aes(x = year, y = age, size = caaw)) +
  geom_point(alpha = 0.8) +
  scale_size(range = c(1, 12)) +
  scale_y_continuous(breaks = seq(min(caa$age), max(caa$age)))
```

### WAA
The large dip in 1984 is an artifact of a mismatch between the length-frequency and bio samples. When a relatively large fraction of the catch has a length-frequency that includes small fish, but no such small fish are present in the selected bio samples (ALK), the current gap-filling algorithm classifies them, possibly wrongly, as the youngest age in the ALK (e.g., age 4). 

This also biases CAA.

A rule would need to be added to detect such mismatches (any age class could be 'missing') and to apply a correction. 
See the hybrid approach from Ailloud 2019. For instance, build an overall ALK with standard deviations and, if there is an x% probability (CI from the global ALK) that a length belongs to an age class absent from the stratum-specific ALK, add more samples (or apply a similar correction).

I expect this might also be why I overestimate CAA of age 1 relative to CATCH.EXE?

```{r waa_plot, message = F, fig.height = 4,fig.width = 10}
# waa over time, one coloured line per age
ggplot(caa, aes(x = year, y = waa, group = age, color = age)) +
  geom_line(size = 1) +
  scale_color_viridis()
```

### WAA.ci
```{r waa_plot_ci, message = F, fig.height = 6,fig.width = 10}
# waa per age with a band of +/- 2 standard deviations, one free-scaled panel per age
ggplot(caa, aes(x = year, y = waa, group = age)) +
  geom_ribbon(aes(ymin = waa - 2 * waa.sd, ymax = waa + 2 * waa.sd), alpha = 0.6) +
  geom_line(size = 1) +
  facet_wrap(~age, scales = 'free')
```

### LAA
```{r laa_plot, message = F, fig.height = 4,fig.width = 10}
# laa over time, one coloured line per age
ggplot(caa, aes(x = year, y = laa, group = age, color = age)) +
  geom_line(size = 1) +
  scale_color_viridis()
```

### LAA.ci
```{r laa_plot_ci, message = F, fig.height = 6,fig.width = 10}
# laa per age with a band of +/- 2 standard deviations, one free-scaled panel per age
ggplot(caa, aes(x = year, y = laa, group = age)) +
  geom_ribbon(aes(ymin = laa - 2 * laa.sd, ymax = laa + 2 * laa.sd), alpha = 0.6) +
  geom_line(size = 1) +
  facet_wrap(~age, scales = 'free')
```