{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} module Main where import Control.Lens import Control.Monad import Data.Aeson.Lens import Data.List import qualified Data.Map as M import Data.Maybe import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.IO as T import Database.SQLite.Simple import Database.SQLite3 as S3 hiding (Statement) import MaveDB import Options.Applicative import PAVA import PyF import Uniprot schema = [fmt| PRAGMA journal_mode = OFF; PRAGMA synchronous = OFF; PRAGMA temp_store = memory; PRAGMA mmap_size = 30000000000; PRAGMA page_size = 32768; CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL, PRIMARY KEY (gene, hgvs_p)); |] data Config = Config FilePath Double (Maybe FilePath) main = configured =<< execParser opts where config = Config <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") <*> option auto ( long "min-pi" <> short 'p' <> metavar "π" <> help "minimum estimated labelling proportion" <> showDefault <> value 0.1 ) <*> option auto ( long "extra" <> short 'e' <> metavar "PATH" <> help "path to JSON map between URNs and uniprot ids to supplement MaveDB URN query" <> value Nothing ) opts = info (config <**> helper) (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") configured (Config out minpi uniprotMap) = do urns <- queryURNs extraUrns <- forM uniprotMap $ \path -> do json <- T.readFile path return $ zip (json ^.. members . asIndex) (json ^.. members . _String) scores <- M.fromListWith (M.unionWith (++)) . catMaybes <$> mapM ( \(x, u) -> do g <- geneName u T.putStrLn g s <- normalise minpi <$> getScores x pure $ (g,) <$> s ) (urns ++ fromMaybe mempty extraUrns) let cScores = M.map (M.toList . M.map geomean) scores withConnection out $ \conn@Connection {..} -> do S3.exec connectionHandle (fromQuery schema) withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores geomean :: [Double] -> Double geomean = exp . mean . map log insertGene conn (gene, muts) = do mapM_ (insertMut conn gene) muts insertMut conn gene (mut, score) = execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double]) normalise minpi xs = let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] (hgvs, y) = unzip $ map snd ord y' = pava y in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y' notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double -- Assumes ∈ {0,1} weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a weightedMean w0 w1 xs = let n1 = sum xs n0 = fromIntegral (length xs) - n1 in w1 * n1 / (w1 * n1 + w0 * n0) puCorrection minpi lab ys = M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc -> let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter w0 = 2 / cf w1 = 1 / mean lab in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab