{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} module Main where import Control.Lens import Data.List import qualified Data.Map as M import Data.Maybe import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.IO as T import Database.SQLite.Simple import Database.SQLite3 as S3 hiding (Statement) import MaveDB import Options.Applicative import PAVA import PyF import Uniprot schema = [fmt| PRAGMA journal_mode = OFF; PRAGMA synchronous = OFF; PRAGMA temp_store = memory; PRAGMA mmap_size = 30000000000; PRAGMA page_size = 32768; CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL, PRIMARY KEY (gene, hgvs_p)); |] newtype Config = Config FilePath main = configured =<< execParser opts where config = Config <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") opts = info (config <**> helper) (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") configured (Config out) = do urns <- queryURNs scores <- M.fromListWith (M.unionWith (++)) . catMaybes <$> mapM ( \(x, u) -> do g <- geneName u T.putStrLn g s <- normalise <$> getScores x pure $ (g,) <$> s ) urns let cScores = M.map (M.toList . M.map geomean) scores withConnection out $ \conn@Connection {..} -> do S3.exec connectionHandle (fromQuery schema) withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores geomean :: [Double] -> Double geomean = exp . mean . map log insertGene conn (gene, muts) = do mapM_ (insertMut conn gene) muts insertMut conn gene (mut, score) = execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) normalise xs = let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] (hgvs, y) = unzip $ map snd ord y' = pava y in M.fromList . zip hgvs . map pure <$> puCorrection y y' notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double -- Assumes ∈ {0,1} weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a weightedMean w0 w1 xs = let n1 = sum xs n0 = fromIntegral (length xs) - n1 in w1 * n1 / (w1 * n1 + w0 * n0) puCorrection lab ys = M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 <&> \nonsc -> let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter w0 = 2 / cf w1 = 1 / mean lab in pava' (weightedMean w0 w1) lab