summaryrefslogtreecommitdiff
path: root/src/scrape.hs
blob: 4b10fbd9560382493244861680757de03cd6883d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}

module Main where

import Control.Lens
import Control.Monad
import Data.Aeson.Lens
import Data.List
import qualified Data.Map as M
import Data.Maybe
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Database.SQLite.Simple
import Database.SQLite3 as S3 hiding (Statement)
import MaveDB
import Options.Applicative
import PAVA
import PyF
import Uniprot

schema =
  [fmt|
    PRAGMA journal_mode = OFF;
    PRAGMA synchronous = OFF;
    PRAGMA temp_store = memory;
    PRAGMA mmap_size = 30000000000;
    PRAGMA page_size = 32768;

    CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL,
      PRIMARY KEY (gene, hgvs_p));
  |]

data Config = Config FilePath Double (Maybe FilePath)

main = configured =<< execParser opts
  where
    config =
      Config
        <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into")
          <*> option
            auto
            ( long "min-pi"
                <> short 'p'
                <> metavar "π"
                <> help "minimum estimated labelling proportion"
                <> showDefault
                <> value 0.1
            )
          <*> optional (strOption
          ( long "extra"
              <> short 'e'
              <> metavar "PATH"
              <> help "path to JSON map between URNs and uniprot ids to supplement MaveDB URN query"
          ))

    opts =
      info
        (config <**> helper)
        (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper")

configured (Config out minpi uniprotMap) = do
  urns <- queryURNs
  extraUrns <- forM uniprotMap $ \path -> do
    json <- T.readFile path
    return $ zip (json ^.. members . asIndex) (json ^.. members . _String)
  scores <-
    M.fromListWith (M.unionWith (++)) . catMaybes
      <$> mapM
        ( \(x, u) -> do
            g <- geneName u
            T.putStrLn g
            s <- normalise minpi <$> getScores x
            pure $ (g,) <$> s
        )
        (urns ++ fromMaybe mempty extraUrns)

  let cScores = M.map (M.toList . M.map geomean) scores

  withConnection out $ \conn@Connection {..} -> do
    S3.exec connectionHandle (fromQuery schema)
    withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores

geomean :: [Double] -> Double
geomean = exp . mean . map log

insertGene conn (gene, muts) = do
  mapM_ (insertMut conn gene) muts

insertMut conn gene (mut, score) =
  execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score)

normalise :: Double -> [(Text, Double)] -> Maybe (M.Map Text [Double])
normalise minpi xs =
  let ord = sort [(s, (h, notTer h)) | (h, s) <- xs]
      (hgvs, y) = unzip $ map snd ord
      y' = pava y
   in M.fromList . zip hgvs . map pure <$> puCorrection minpi y y'

notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double

-- Assumes ∈ {0,1}
weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a
weightedMean w0 w1 xs =
  let n1 = sum xs
      n0 = fromIntegral (length xs) - n1
   in w1 * n1 / (w1 * n1 + w0 * n0)

puCorrection minpi lab ys =
  M.fromListWith (++) (zip lab $ map pure ys) ^. at 0 >>= \nonsc ->
    let cf = 1 - mean nonsc -- NB: reversed due to dms score being lower when Ter
        w0 = 2 / cf
        w1 = 1 / mean lab
     in if cf < minpi then Nothing else Just $ pava' (weightedMean w0 w1) lab