From 8ecdab68587c07cf88ebc0b8a8f2bd0e1a703e37 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Thu, 29 Sep 2022 08:53:40 +1000 Subject: init --- .gitignore | 1 + default.nix | 23 ++++++++++++++ flake.lock | 42 +++++++++++++++++++++++++ flake.nix | 8 +++++ src/MaveDB.hs | 58 ++++++++++++++++++++++++++++++++++ src/PAVA.hs | 34 ++++++++++++++++++++ src/Uniprot.hs | 26 ++++++++++++++++ src/scrape.hs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 290 insertions(+) create mode 100644 .gitignore create mode 100644 default.nix create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 src/MaveDB.hs create mode 100644 src/PAVA.hs create mode 100644 src/Uniprot.hs create mode 100644 src/scrape.hs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fcfc4a1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +result* diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..0a9f8b6 --- /dev/null +++ b/default.nix @@ -0,0 +1,23 @@ +{pkgs ? import {}}: +with pkgs; let + ghc = haskellPackages.ghcWithPackages (ps: + with ps; [ + lens-aeson + wreq + sqlite-simple + PyF + optparse-applicative + ]); +in + stdenv.mkDerivation { + name = "mavedb-scraper"; + src = ./src; + + buildInputs = [ghc]; + + buildPhase = '' + ghc -o scrape *.hs -O -rtsopts -threaded + ''; + + installPhase = "install -Dm755 ./scrape $out/bin/scrape"; + } diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..f22c045 --- /dev/null +++ b/flake.lock @@ -0,0 +1,42 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1659877975, + "narHash": "sha256-zllb8aq3YO3h8B/U0/J1WBgAL8EX5yWf5pMj3G0NAmc=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "c0e246b9b83f637f4681389ecabcb2681b4f3af0", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1664405354, + "narHash": "sha256-kGTmqe/6GTfP2dbvYSH2MnoGSTwY4C7SL7ofcYf3/N0=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "37280688cb4221215696003baf8797858c8c546a", + "type": "github" + }, + "original": { + "owner": "nixos", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..37741f3 --- /dev/null +++ b/flake.nix @@ -0,0 +1,8 @@ +{ +inputs.nixpkgs.url = "github:nixos/nixpkgs"; +inputs.flake-utils.url = "github:numtide/flake-utils"; +outputs = {self, nixpkgs, flake-utils}: + flake-utils.lib.eachDefaultSystem (system: let pkgs = import nixpkgs {inherit system;}; in { + defaultPackage = import ./default.nix { inherit pkgs; }; + }); + } diff --git a/src/MaveDB.hs b/src/MaveDB.hs new file mode 100644 index 0000000..b662231 --- /dev/null +++ b/src/MaveDB.hs @@ -0,0 +1,58 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} + +module MaveDB where + +import Control.Lens +import Data.Aeson.Lens +import qualified Data.ByteString.Lazy.Char8 as BS +import qualified Data.Map.Strict as M +import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T +import Network.HTTP.Client (managerResponseTimeout, responseTimeoutNone) +import Network.HTTP.Client.TLS (tlsManagerSettings) +import Network.Wreq +import Uniprot (UniprotID) + +type URN = Text + +type Gene = Text + +type HGVSP = Text + +type Score = (HGVSP, Double) + +-- Standard options for requests +opts = defaults & manager .~ Left (tlsManagerSettings {managerResponseTimeout = responseTimeoutNone}) + +--opts = defaults & manager .~ Left tlsManagerSettings + +-- Retrieve a list of URNs which are protein coding human genes +queryURNs :: IO [(URN, UniprotID)] +queryURNs = do + rep <- getWith opts "https://mavedb.org/api/scoresets/" + pure $ + rep ^. responseBody + & (^.. traverse . runFold ((,) <$> Fold (key "urn" . _String) <*> Fold (key "target" . key "uniprot" . key "identifier" . _String))) + . (^.. folded . filtered (allOf (key "target" . key "reference_maps" . _Array . traverse . key "genome" . key "short_name" . _String) (== "hg38"))) + . (^.. folded . filtered (allOf (key "target" . key "type" . _String) (== "Protein coding"))) + . (^.. _Array . folded . filtered (allOf (key "target" . key "score_columns" . _Array) (\((^.. traverse . _String) -> scs) -> "hgvs_pro" `elem` scs && "score" `elem` scs))) + +getScores :: URN -> IO [Score] +getScores urn = do + let urn' = T.unpack urn + rep <- getWith opts $ "https://mavedb.org/scoreset/" <> urn' <> "/scores/" + let hdr : rows = dropComments . BS.lines . BS.filter (/= '\r') $ rep ^. responseBody + rows' = map (M.fromList . zip (BS.split ',' hdr) . BS.split ',') rows + pure $ + rows' + ^.. traverse + . runFold + ( (,) + <$> Fold (at "hgvs_pro" . to (fromMaybe $ error ("no hgvs_pro column in " <> urn')) . to (T.pack . BS.unpack)) + <*> Fold (at "score" . traverse . _Double) + ) + where + dropComments (a : as) = if BS.head a == '#' then dropComments as else a : as + dropComments x = x diff --git a/src/PAVA.hs b/src/PAVA.hs new file mode 100644 index 0000000..35ea4ec --- /dev/null +++ b/src/PAVA.hs @@ -0,0 +1,34 @@ +module PAVA where + +data Block a = Block ([a] -> a) [a] + +instance (Fractional a, Eq a) => Eq (Block a) where + x == y = mu x == mu y + +instance (Fractional a, Ord a) => Ord (Block a) where + x <= y = mu x <= mu y + +mu :: Fractional a => Block a -> a +mu (Block f xs) = f xs + +mean :: Fractional a => [a] -> a +mean = (\(a, b) -> b / a) . foldl (\(a, b) c -> (a + 1, b + c)) (0, 0) + +pava' :: (Show a, Eq a, Ord a, Fractional a) => ([a] -> a) -> [a] -> [a] +pava' f = concatMap unblock . untilFixed fixViol . map (Block f . pure) + where + untilFixed f x = + let y = f x + in if y == x then y else untilFixed f y + + fixViol (a : b : xs) + | a > b = merge a b : xs + | otherwise = a : fixViol (b : xs) + fixViol x = x + + merge (Block _ a) (Block _ b) = Block f (a <> b) + +pava = pava' mean + +unblock :: Fractional a => Block a -> [a] +unblock (Block f xs) = replicate (length xs) (f xs) diff --git a/src/Uniprot.hs b/src/Uniprot.hs new file mode 100644 index 0000000..9458cfb --- /dev/null +++ b/src/Uniprot.hs @@ -0,0 +1,26 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Uniprot where + +import Codec.Compression.GZip +import Control.Lens +import Data.Aeson.Lens +import qualified Data.ByteString.Char8 as BSS +import qualified Data.ByteString.Lazy.Char8 as BS +import Data.Text (Text) +import qualified Data.Text as T +import Network.Wreq + +type UniprotID = Text + +type GeneName = Text + +geneName :: UniprotID -> IO GeneName +geneName id = do + rep <- getWith (defaults & header "Accept" .~ ["application/json"]) $ "https://rest.uniprot.org/uniprotkb/" <> T.unpack id + let encoding = rep ^. responseHeader "content-encoding" + case encoding of + "gzip" -> + let json = decompress $ rep ^. responseBody + in pure $ json ^. key "genes" . _Array . traverse . key "geneName" . key "value" . _String + _ -> error $ "unkown content encoding " <> BSS.unpack encoding diff --git a/src/scrape.hs b/src/scrape.hs new file mode 100644 index 0000000..7a5a280 --- /dev/null +++ b/src/scrape.hs @@ -0,0 +1,98 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} + +module Main where + +import Control.Lens +import Data.List +import qualified Data.Map as M +import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Database.SQLite.Simple +import Database.SQLite3 as S3 hiding (Statement) +import MaveDB +import Options.Applicative +import PAVA +import PyF +import Uniprot + +schema = + [fmt| + PRAGMA journal_mode = OFF; + PRAGMA synchronous = OFF; + PRAGMA temp_store = memory; + PRAGMA mmap_size = 30000000000; + PRAGMA page_size = 32768; + + CREATE TABLE IF NOT EXISTS dms(gene TEXT NOT NULL, hgvs_p TEXT NOT NULL, pnonsense REAL NOT NULL, + PRIMARY KEY (gene, hgvs_p)); + |] + +newtype Config = Config FilePath + +main = configured =<< execParser opts + where + config = + Config + <$> strArgument (metavar "DB" <> help "path to SQLite DB to insert into") + + opts = + info + (config <**> helper) + (fullDesc <> progDesc "scrapes MaveDB for human proteins, normalises, and inserts HGVS notation into a SQLite DB" <> header "scrape -- MaveDB scraper") + +configured (Config out) = do + urns <- queryURNs + scores <- + M.fromListWith (M.unionWith (++)) . catMaybes + <$> mapM + ( \(x, u) -> do + g <- geneName u + T.putStrLn g + s <- normalise <$> getScores x + pure $ (g,) <$> s + ) + urns + + let cScores = M.map (M.toList . M.map geomean) scores + + withConnection out $ \conn@Connection {..} -> do + S3.exec connectionHandle (fromQuery schema) + withTransaction conn . mapM_ (insertGene conn) $ M.toList cScores + +geomean :: [Double] -> Double +geomean = exp . mean . map log + +insertGene conn (gene, muts) = do + mapM_ (insertMut conn gene) muts + +insertMut conn gene (mut, score) = + execute conn "insert or ignore into dms values (?,?,?)" (gene, mut, 1 - score) + +normalise :: [(Text, Double)] -> Maybe (M.Map Text [Double]) +normalise xs = + let ord = sort [(s, (h, notTer h)) | (h, s) <- xs] + (hgvs, y) = unzip $ map snd ord + y' = pava y + in M.fromList . zip hgvs . map pure <$> puCorrection hgvs y' + +notTer str = if (T.reverse . T.take 3 $ T.reverse str) == "Ter" then 0 else 1 :: Double + +-- Assumes ∈ {0,1} +weightedMean :: (Foldable f, Fractional a) => a -> a -> f a -> a +weightedMean w0 w1 xs = + let n1 = sum xs + n0 = fromIntegral (length xs) - n1 + in w1 * n1 / (w1 * n1 + w0 * n0) + +puCorrection hgvs ys = + let lab = map notTer hgvs + in M.fromListWith (++) (zip lab $ map pure ys) ^. at 1 <&> \nons -> + let cf = 1 - mean nons -- NB: reversed due to dms score being lower when Ter + w0 = 2 / cf + w1 = 1 / mean lab + in pava' (weightedMean w0 w1) lab -- cgit v1.2.3