1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
module Main where
import Control.Lens
import Data.List
import qualified Data.Map as M
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Database.SQLite.Simple
import Database.SQLite3 as S3 hiding (Statement)
import MaveDB
import Options.Applicative
import PAVA
import PyF
import Uniprot
schema =
[fmt|
PRAGMA journal_mode = OFF;
PRAGMA synchronous = OFF;
PRAGMA temp_store = memory;
PRAGMA mmap_size = 30000000000;
PRAGMA page_size = 32768;
CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL,
PRIMARY KEY (gene, hgvs_p));
|]
data Config = Config FilePath Double
main = configured =<< execParser opts
where
config =
Config
<$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into")
<*> option
auto
( long "min-pi"
<> short 'p'
<> metavar "π"
<> help "minimum estimated labelling proportion"
<> showDefault
<> value 0.1
)
opts =
info
(config <**> helper)
(fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper")
configured (Config out minpi) = do
urns <- queryURNs
scores <-
M.fromListWith (M.unionWith (++)) . catMaybes
<$> mapM
( \(x, u) -> do
g <- geneName u
T.putStrLn g
s <- normalise minpi <$> getScores x
pure $ (g,) <$> s
)
urns
let cScores = M.map (M.toList . M.map geomean) scores
withConnection out $ \conn@Connection {..} -> do
S3.exec connectionHandle (fromQuery schema)
withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores
geomean :: [Double] -> Double
geomean = exp . mean . map log
insertGene conn (gene, muts) = do
mapM_ (insertMut conn gene) muts
insertMut conn gene (mut, score) =
execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score)
normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double])
normalise minpi xs =
let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]
(hgvs, y) = unzip $ map snd ord
y' = pava y
in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y'
notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double
-- Assumes ∈ {0,1}
weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a
weightedMean w0 w1 xs =
let n1 = sum xs
n0 = fromIntegral (length xs) - n1
in w1 * n1 / (w1 * n1 + w0 * n0)
puCorrection minpi lab ys =
M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc ->
let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter
w0 = 2 / cf
w1 = 1 / mean lab
in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab
|