From 71fc907a499f3938b28c055b40af0b68f65a2516 Mon Sep 17 00:00:00 2001 From: abychkova <7148571+abychkova@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:29:29 +0600 Subject: [PATCH] version 0.1.5.6: YLAB2-2755: spaces in gb locus name (#76) --- ChangeLog.md | 13 +++--- package.yaml | 2 +- src/Bio/GB/Parser.hs | 33 +++++++------- src/Bio/MMTF/Decode/MessagePack.hs | 6 ++- src/Bio/PDB.hs | 10 ++--- src/Bio/PDB/BondRestoring.hs | 40 +++++++++-------- src/Bio/PDB/Functions.hs | 15 ++++--- src/Bio/PDB/Reader.hs | 2 +- src/Bio/Sequence/Functions/Marking.hs | 2 +- src/Bio/Sequence/Functions/Weight.hs | 4 +- src/Bio/Uniprot/Parser.hs | 29 ++++++++---- test/GB/pIntA-TRBV.gb | 8 ++-- test/GB/spaces_in_locus.gb | 20 +++++++++ test/GBParserSpec.hs | 64 +++++++++++++++------------ 14 files changed, 149 insertions(+), 99 deletions(-) create mode 100644 test/GB/spaces_in_locus.gb diff --git a/ChangeLog.md b/ChangeLog.md index 691e873..10636f8 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,9 @@ ## [Unreleased] +## [0.1.5.6] - 2024-12-18 +- Fix for gb-parser: spaces in name in LOCUS + ## [0.1.5.5] - 2024-05-16 - Add `5ROX` modification to Fasta parser. @@ -64,12 +67,12 @@ Added ASN hydrogen names sometimes set by Scho - Update dependency versions. ## [0.1.3.20] - 2021-06-04 -### Changed -- YLAB2-629: Fasta parser is now able to parse empty lines in the beginning. +### Changed +- YLAB2-629: Fasta parser is now able to parse empty lines in the beginning. ## [0.1.3.19] - 2021-04-30 -### Changed -- Exports and instances for Biosset. +### Changed +- Exports and instances for Biosset. ## [0.1.3.18] - 2021-03-09 ### Fixed @@ -110,7 +113,7 @@ Added ASN hydrogen names sometimes set by Scho ## [0.1.3.9] - 2020-10-27 ### Fixed -- FASTA parser can now parse empty lines with spaces. +- FASTA parser can now parse empty lines with spaces. ## [0.1.3.8] - 2020-10-22 ### Fixed diff --git a/package.yaml b/package.yaml index a467a6a..e073f40 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: cobot-io -version: 0.1.5.5 +version: 0.1.5.6 github: "biocad/cobot-io" license: BSD3 category: Bio diff --git a/src/Bio/GB/Parser.hs b/src/Bio/GB/Parser.hs index 3f3fc52..f3bb340 100644 --- a/src/Bio/GB/Parser.hs +++ b/src/Bio/GB/Parser.hs @@ -26,7 +26,7 @@ genBankP :: Parser GenBankSequence genBankP = GenBankSequence <$> (metaP "Meta parser") <*> (gbSeqP "GB sequence parser") - <* string "//" <* eolSpaceP + <* string "//" <* eolSpaceP -------------------------------------------------------------------------------- -- Block with meta-information. @@ -48,16 +48,19 @@ metaP = do locusP :: Parser Locus locusP = string "LOCUS" *> space *> (Locus - <$> textP <* space -- name + <$> nameP <* space -- name <*> decimal <* space <* string "bp" <* space -- sequence length <*> textP <* space -- molecule type - <*> optional formP <* space -- form of sequence - <*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division + <*> optional formP <* space -- form of sequence + <*> optional (pack <$> some (satisfy isUpper)) <* space -- GenBank division <*> textP -- modification date <* eolSpaceP) where textP = takeWhile1P Nothing $ not . isSpace + nameP :: Parser Text + nameP = textP <> (try (string " " <> nameP) <|> "") + formP :: Parser Form formP = try (string "linear" $> Linear) <|> (string "circular" $> Circular) @@ -108,7 +111,7 @@ commentP = string "COMMENT" *> (try emptyP <|> (many (char ' ') *> someLinesP)) -------------------------------------------------------------------------------- featuresP :: Parser [(Feature, Range)] -featuresP = -- skip unknown fields and stop on line with "FEATURES" +featuresP = -- skip unknown fields and stop on line with "FEATURES" manyTill (textWithSpacesP <* eolSpaceP) (string "FEATURES") *> space *> textWithSpacesP <* eolSpaceP *> some (featureP "Single feature parser") @@ -128,8 +131,8 @@ featureP = do pure (Feature featureName' props, shiftRange (-1) range) rangeP :: Parser Range -rangeP = try spanP - <|> try betweenP +rangeP = try spanP + <|> try betweenP <|> try pointP <|> try joinP <|> complementP @@ -141,8 +144,8 @@ rangeP = try spanP _ <- string ".." upperBorderType <- option Precise (try $ char '>' *> pure Exceeded) upperBorderLocation <- decimal - pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation) - + pure $ Span (RangeBorder lowerBorderType lowerBorderLocation) (RangeBorder upperBorderType upperBorderLocation) + betweenP :: Parser Range betweenP = do before <- decimal @@ -152,13 +155,13 @@ rangeP = try spanP pointP :: Parser Range pointP = fmap Point decimal - + joinP :: Parser Range joinP = string "join(" *> fmap Join (rangeP `sepBy1` char ',') <* char ')' complementP :: Parser Range complementP = fmap Complement $ string "complement(" *> rangeP <* char ')' - + propsP :: Parser (Text, Text) propsP = do @@ -178,17 +181,17 @@ propsP = do indLine = do _ <- string featureIndent2 notFollowedBy (char '/') - text <- textWithSpacesP + text <- textWithSpacesP eolSpaceP pure text multiLineProp :: Parser Text multiLineProp = do - fstText <- textWithSpacesP <* eolSpaceP + fstText <- textWithSpacesP <* eolSpaceP rest <- many (try indLine) - pure $ T.concat (fstText : rest) + pure $ T.concat (fstText : rest) + - -- | First level of identation in FEATURES table file. -- diff --git a/src/Bio/MMTF/Decode/MessagePack.hs b/src/Bio/MMTF/Decode/MessagePack.hs index 22b99ce..f302b60 100644 --- a/src/Bio/MMTF/Decode/MessagePack.hs +++ b/src/Bio/MMTF/Decode/MessagePack.hs @@ -43,7 +43,11 @@ asStr _ (ObjectStr s) = pure s asStr m _ = fail $ T.unpack m <> ": not a string data" asChar :: MonadFail m => Text -> Object -> m Char -asChar m = (head . T.unpack <$>) . asStr m +asChar txt obj = do + str <- asStr txt obj + case T.unpack str of + [] -> return ' ' + (c : _) -> return c asInt :: (MonadFail m, Integral a) => Text -> Object -> m a asInt _ (ObjectInt i) = pure (fromIntegral i) diff --git a/src/Bio/PDB.hs b/src/Bio/PDB.hs index 925c7ea..fe16b57 100644 --- a/src/Bio/PDB.hs +++ b/src/Bio/PDB.hs @@ -72,15 +72,13 @@ instance StructureModels PDB.PDB where mkResidue :: Map Text (Vector (Bond LocalID)) -> [PDB.Atom] -> Residue mkResidue _ [] = error "Cound not make residue from empty list" - mkResidue localBondsMap atoms' = Residue (T.strip $ PDB.atomResName firstResidueAtom) - (PDB.atomResSeq firstResidueAtom) - (PDB.atomICode firstResidueAtom) + mkResidue localBondsMap atoms'@(firstAtom : _) = Residue (T.strip $ PDB.atomResName firstAtom) + (PDB.atomResSeq firstAtom) + (PDB.atomICode firstAtom) (V.fromList $ mkAtom <$> atoms') - (localBondsMap M.!?! residueID firstResidueAtom) + (localBondsMap M.!?! residueID firstAtom) Undefined -- now we do not read secondary structure "" -- chemical component type?! - where - firstResidueAtom = head atoms' mkAtom :: PDB.Atom -> Atom mkAtom atom@PDB.Atom{..} = Atom (GlobalID $ atomToNilBasedIndex M.!?! atom) diff --git a/src/Bio/PDB/BondRestoring.hs b/src/Bio/PDB/BondRestoring.hs index fec3908..0f0287a 100644 --- a/src/Bio/PDB/BondRestoring.hs +++ b/src/Bio/PDB/BondRestoring.hs @@ -42,25 +42,28 @@ restoreChainLocalBonds' chainAtoms = residueIDToLocalBonds residueIDToLocalBonds = do (residueAtoms, residueBonds) <- zip chainAtomsGroupedByResidue intraResidueGlobalBonds let localBonds = V.fromList $ convertGlobalsToLocals residueAtoms residueBonds - let _residueID = residueID $ head residueAtoms + let _residueID = + case residueAtoms of + [] -> "" + (atom : _) -> residueID atom pure (_residueID, localBonds) - + intraResidueGlobalBonds :: [[Bond PDB.Atom]] intraResidueGlobalBonds = fmap restoreIntraResidueBonds chainAtomsGroupedByResidue - + chainAtomsGroupedByResidue :: [[PDB.Atom]] chainAtomsGroupedByResidue = groupChainByResidue chainAtoms - + convertGlobalsToLocals :: [PDB.Atom] -> [Bond PDB.Atom] -> [Bond LocalID] convertGlobalsToLocals residueAtoms = map convertGlobalToLocal where convertGlobalToLocal :: Bond PDB.Atom -> Bond LocalID - convertGlobalToLocal (Bond from to order) = + convertGlobalToLocal (Bond from to order) = Bond (LocalID $ atomToLocalIdMap ! from) (LocalID $ atomToLocalIdMap ! to) order - + atomToLocalIdMap :: Map PDB.Atom Int atomToLocalIdMap = M.fromList $ zip sortedAtoms [0..] - + sortedAtoms :: [PDB.Atom] sortedAtoms = sort residueAtoms @@ -70,19 +73,19 @@ restoreModelGlobalBonds atomToNilBasedIndex chains = convertToGlobalIDs atomToNi where convertToGlobalIDs :: Map PDB.Atom Int -> Vector (Bond PDB.Atom) -> Vector (Bond GlobalID) convertToGlobalIDs mapping = reindexBonds (\atom -> GlobalID $ mapping ! atom) - + reindexBonds :: (a -> b) -> Vector (Bond a) -> Vector (Bond b) reindexBonds convertID = fmap (\(Bond from to order) -> Bond (convertID from) (convertID to) order) chainAtomsGroupedByResidue :: Vector [[PDB.Atom]] chainAtomsGroupedByResidue = fmap groupChainByResidue chains - + _intraResidueBonds :: [Bond PDB.Atom] _intraResidueBonds = concatMap restoreChainIntraResidueBonds chainAtomsGroupedByResidue - + peptideBonds :: [Bond PDB.Atom] peptideBonds = concatMap restoreChainPeptideBonds chainAtomsGroupedByResidue - + disulfideBonds :: [Bond PDB.Atom] disulfideBonds = restoreDisulfideBonds . concat $ V.toList chainAtomsGroupedByResidue @@ -116,7 +119,7 @@ restoreChainPeptideBonds atomsGroupedByResidue = catMaybes $ restoreChainPeptide restoreChainPeptideBonds' :: [[PDB.Atom]] -> [Maybe (Bond PDB.Atom)] -> [Maybe (Bond PDB.Atom)] restoreChainPeptideBonds' [] acc = acc restoreChainPeptideBonds' [_] acc = acc - restoreChainPeptideBonds' (residue1:residue2:residues) acc = + restoreChainPeptideBonds' (residue1:residue2:residues) acc = restoreChainPeptideBonds' (residue2:residues) (constructBond residue1 residue2 : acc) constructBond :: [PDB.Atom] -> [PDB.Atom] -> Maybe (Bond PDB.Atom) @@ -129,7 +132,7 @@ restoreChainPeptideBonds atomsGroupedByResidue = catMaybes $ restoreChainPeptide guard $ distance (coords carbonAtom1) (coords nitrogenAtom2) < peptideBondMaxLength pure $ Bond carbonAtom1 nitrogenAtom2 1 - + getAtomByName :: [PDB.Atom] -> Text -> Maybe PDB.Atom getAtomByName atoms atomNameToFind = find ((atomNameToFind ==) . T.strip . PDB.atomName) atoms @@ -137,20 +140,21 @@ restoreChainIntraResidueBonds :: [[PDB.Atom]] -> [Bond PDB.Atom] restoreChainIntraResidueBonds = concatMap restoreIntraResidueBonds restoreIntraResidueBonds :: [PDB.Atom] -> [Bond PDB.Atom] -restoreIntraResidueBonds residueAtoms = catMaybes $ constructBond <$> residueBonds +restoreIntraResidueBonds [] = [] +restoreIntraResidueBonds residueAtoms@(firstAtom : _) = catMaybes $ constructBond <$> residueBonds where -- TODO: support bond order somehow constructBond :: (Text, Text) -> Maybe (Bond PDB.Atom) constructBond (fromAtomName, toAtomName) = Bond <$> constructAtom fromAtomName <*> constructAtom toAtomName <*> Just 1 - + constructAtom :: Text -> Maybe PDB.Atom constructAtom atomName = atomNameToAtom !? atomName - + atomNameToAtom :: Map Text PDB.Atom atomNameToAtom = M.fromList $ (\atom@PDB.Atom{..} -> (T.strip atomName, atom)) <$> residueAtoms - + residueBonds :: [(Text, Text)] - residueBonds = intraResidueBonds . T.strip . PDB.atomResName $ head residueAtoms + residueBonds = intraResidueBonds . T.strip . PDB.atomResName $ firstAtom intraResidueBonds :: Text -> [(Text, Text)] intraResidueBonds "NMA" = [("CA", "N")] diff --git a/src/Bio/PDB/Functions.hs b/src/Bio/PDB/Functions.hs index f7fc3c6..0adefa2 100644 --- a/src/Bio/PDB/Functions.hs +++ b/src/Bio/PDB/Functions.hs @@ -7,20 +7,21 @@ import qualified Bio.Utils.Map as M ((!?!)) import Data.Map.Strict (Map) import qualified Data.Map.Strict as M (fromList) -import Data.List (groupBy, +import Data.List (groupBy, sortOn) import Data.Vector (Vector) import qualified Data.Vector as V (toList) import Data.Char (toUpper) groupChainByResidue :: Vector PDB.Atom -> [[PDB.Atom]] -groupChainByResidue = sortOn (sortOnResidue . head) . groupBy atomsFromSameResidue . V.toList - where +groupChainByResidue = sortOn sortOnResidue . groupBy atomsFromSameResidue . V.toList + where atomsFromSameResidue :: PDB.Atom -> PDB.Atom -> Bool atomsFromSameResidue atom1 atom2 = PDB.atomResSeq atom1 == PDB.atomResSeq atom2 && PDB.atomICode atom1 == PDB.atomICode atom2 - - sortOnResidue :: PDB.Atom -> Int - sortOnResidue PDB.Atom{..} = atomSerial * 100 + (insertionCodeSortingCorrections M.!?! toUpper atomICode) - + + sortOnResidue :: [PDB.Atom] -> Int + sortOnResidue [] = -1000000 + sortOnResidue (PDB.Atom{..} : _) = atomSerial * 100 + (insertionCodeSortingCorrections M.!?! toUpper atomICode) + insertionCodeSortingCorrections :: Map Char Int insertionCodeSortingCorrections = M.fromList $ zip (' ':['A'..'Z']) [0..] diff --git a/src/Bio/PDB/Reader.hs b/src/Bio/PDB/Reader.hs index f79b97c..0625e2c 100644 --- a/src/Bio/PDB/Reader.hs +++ b/src/Bio/PDB/Reader.hs @@ -48,7 +48,7 @@ isMdlLine line = elem (T.take 6 line) modelStrings || elem (T.take 5 line) model checkRow :: [Int] -> Bool checkRow [] = True -checkRow xs = last xs - head xs + 1 == L.length xs +checkRow row@(x : _) = last row - x + 1 == L.length row checkMdlLines :: ([PDBWarnings], Text) -> Bool checkMdlLines warnings'n'text = checkRow mdlLineNumbers diff --git a/src/Bio/Sequence/Functions/Marking.hs b/src/Bio/Sequence/Functions/Marking.hs index a25017b..1430068 100644 --- a/src/Bio/Sequence/Functions/Marking.hs +++ b/src/Bio/Sequence/Functions/Marking.hs @@ -16,7 +16,7 @@ import Data.List (nub) import Data.List.NonEmpty (NonEmpty (..)) import Data.Text (Text) import qualified Data.Vector as V (toList) -import Prelude hiding (drop, head, length, null, reverse, tail, take, (!!)) +import Prelude hiding (length) import Bio.NucleicAcid.Nucleotide (Complementary (..)) import Bio.Sequence.Class (ContainsMarking, IsBareSequence, IsMarkedSequence, diff --git a/src/Bio/Sequence/Functions/Weight.hs b/src/Bio/Sequence/Functions/Weight.hs index bac8e8f..f9caadb 100644 --- a/src/Bio/Sequence/Functions/Weight.hs +++ b/src/Bio/Sequence/Functions/Weight.hs @@ -25,9 +25,7 @@ import Data.Text (Text) import Data.Vector (Vector) import qualified Data.Vector as V (drop, length, take, toList, (!)) -import Prelude hiding (drop, head, length, - null, reverse, tail, take, - (!!)) +import Prelude hiding (drop, length, take, (!!)) -- | Range of form [a, b]. -- diff --git a/src/Bio/Uniprot/Parser.hs b/src/Bio/Uniprot/Parser.hs index aa1d1ad..7854aa5 100644 --- a/src/Bio/Uniprot/Parser.hs +++ b/src/Bio/Uniprot/Parser.hs @@ -8,7 +8,7 @@ module Bio.Uniprot.Parser where import Prelude hiding (null) -import qualified Prelude as P (concat, init, last, null, tail) +import qualified Prelude as P (concat, init, last) #if MIN_VERSION_base(4, 18, 0) import Control.Applicative ((<|>)) @@ -127,7 +127,9 @@ parseDE = do string "=" pure () result <- pack . P.init <$> many1 (satisfy (not . isEndOfLine)) - pure . head $ " {ECO" `splitOn` result + case " {ECO" `splitOn` result of + [] -> pure "" + (txt : _) -> pure txt -- Parses internal DE entities parseInternal :: Text -> Parser DE @@ -235,9 +237,11 @@ parseOH = do taxId <- pack <$> many1 (notChar ';') char ';' hostName' <- many' (satisfy $ not . isEndOfLine) - let hostName = pack $ if P.null hostName' - then "" - else P.tail . P.init $ hostName' + let hostName = pack $ + case hostName' of + [] -> "" + [x] -> "" + (x : xs) -> init xs pure OH{..} -- | Parses RN, RP, RC, RX, RG, RA, RT and RL lines of UniProt-KB text file. @@ -300,7 +304,11 @@ parseCC = do topic <- pack <$> many1 (notChar ':') char ':' (char ' ' $> ()) <|> (endOfLine >> string "CC" >> count 7 space $> ()) - comment <- head . (" {ECO" `splitOn`) . pack <$> parseMultiLineComment "CC" 7 + splitedComments <- (" {ECO" `splitOn`) . pack <$> parseMultiLineComment "CC" 7 + let comment = + case splitedComments of + [] -> "" + (x : _) -> x pure CC{..} -- | UniProt-KB copyright comment @@ -494,7 +502,10 @@ parseBreak txt = ((endOfLine >> string txt >> string " ") <|> string " ") $> ( parseDefItem :: Text -> Parser Text parseDefItem name = do string name >> char '=' - head . (" {" `splitOn`) . pack <$> parseTillChar ';' + txt <- (" {" `splitOn`) . pack <$> parseTillChar ';' + case txt of + [] -> return "" + (x : _) -> return x -- | Parses line till specific char (e.g. semicolon or dot) before space/endOfLine/endOfInput. parseTillChar :: Char -> Parser String @@ -524,8 +535,8 @@ hyphenConcat [x] = x hyphenConcat (x:y:ys) = x ++ hyphenConcat (sy:ys) where sy :: String - sy | last x == '-' = tail y - | isAA (last x) && isAA (y !! 1) = tail y + sy | last x == '-' = drop 1 y + | isAA (last x) && isAA (y !! 1) = drop 1 y | otherwise = y isAA :: Char -> Bool diff --git a/test/GB/pIntA-TRBV.gb b/test/GB/pIntA-TRBV.gb index 5080990..6c7874c 100644 --- a/test/GB/pIntA-TRBV.gb +++ b/test/GB/pIntA-TRBV.gb @@ -1,6 +1,6 @@ -LOCUS P2-32_pIntA-TRBV5-1_J1-1-Fc-lama-knob-EPEA 5960 bp circular 27-NOV-2020 -SOURCE - ORGANISM +LOCUS P2-32_pIntA-TRBV5-1_J1-1-Fc-lama-knob-EPEA 5960 bp circular 27-NOV-2020 +SOURCE + ORGANISM COMMENT This file is created by Vector NTI http://www.invitrogen.com/ COMMENT ORIGDB|GenBank @@ -9,7 +9,7 @@ COMMENT VNTDBDATE|-13020134| COMMENT LSOWNER| COMMENT VNTNAME|pIntA-mIgK-GFP-stuffer-KpnI-EPEA| COMMENT VNTAUTHORNAME|Demo User| -BASE COUNT 1711 a 1577 c 1543 g 1830 t +BASE COUNT 1711 a 1577 c 1543 g 1830 t EMBL_ID P1-79_pIntA-mIgK-GFP-stuffer-KpnI-EPEA 5945 bp circular 14-OCT-2020 EMBL_ID P1-79_pIntA-mIgK-GFP-stuffer-KpnI-EPEA 6661 bp circular 29-JAN-2020 EMBL_ID pIntA-mIgK-GFP-s 6661 bp DNA circular 4-OCT-2019 diff --git a/test/GB/spaces_in_locus.gb b/test/GB/spaces_in_locus.gb new file mode 100644 index 0000000..13ab1f5 --- /dev/null +++ b/test/GB/spaces_in_locus.gb @@ -0,0 +1,20 @@ +LOCUS BLABLA BLABLA BLABLA 100 bp DNA circular SYN 11-SEP-2024 +DEFINITION synthetic circular DNA. +ACCESSION . +VERSION . +KEYWORDS . +SOURCE synthetic DNA construct + ORGANISM synthetic DNA construct +REFERENCE 1 (bases 1 to 100) + AUTHORS hhh + TITLE Direct Submission + JOURNAL Exported Sep 12, 2024 from SnapGene 6.1.0 + https://www.snapgene.com +FEATURES Location/Qualifiers + source 1..100 + /mol_type="other DNA" + /organism="plasmid" +ORIGIN + 1 cctacagcgt cctacagcgt cctacagcgt cctacagcgt cctacagcgt cctacagcgt + 61 cctacagcgt cctacagcgt cctacagcgt cctacagcgt +// diff --git a/test/GBParserSpec.hs b/test/GBParserSpec.hs index b27ee0e..225634f 100644 --- a/test/GBParserSpec.hs +++ b/test/GBParserSpec.hs @@ -20,50 +20,51 @@ gbParserSpec = describe "GenBank format parser." $ do dottedMetaSpecP "test/GB/pAAV-GFP-CellBioLab-dots.gb" unknownFieldsSpecP "test/GB/pIntA-TRBV.gb" baseCountWithSophisticatedRangesAndMultilineFeatures "test/GB/fromYanaWithLove.gb" + spacesInLocusName "test/GB/spaces_in_locus.gb" rangeTests :: Spec rangeTests = describe "Range parser" $ do - it "correctly parses a simple span" $ + it "correctly parses a simple span" $ greedyRangeP "69..420" `shouldBe` successful (Span (RangeBorder Precise 69) (RangeBorder Precise 420)) it "correctly parses a span with the lower border exceeded" $ greedyRangeP "<69..420" `shouldBe` successful (Span (RangeBorder Exceeded 69) (RangeBorder Precise 420)) it "correctly parses a span with the upper border exceeded" $ greedyRangeP "69..>420" `shouldBe` successful (Span (RangeBorder Precise 69) (RangeBorder Exceeded 420)) - it "correctly parses a span with both border exceeded" $ + it "correctly parses a span with both border exceeded" $ greedyRangeP "<69..>420" `shouldBe` successful (Span (RangeBorder Exceeded 69) (RangeBorder Exceeded 420)) - it "does not parse a span with the lower border exceeded incorrectly" $ + it "does not parse a span with the lower border exceeded incorrectly" $ greedyRangeP ">69..420" `shouldSatisfy` isFail - it "does not parse a span with the upper border exceeded incorrectly" $ + it "does not parse a span with the upper border exceeded incorrectly" $ greedyRangeP "69..<420" `shouldSatisfy` isFail - it "correctly parses a 'between' statement" $ + it "correctly parses a 'between' statement" $ greedyRangeP "41^42" `shouldBe` successful (Between 41 42) - it "does not parse a 'between' statement witn border excession marks" $ + it "does not parse a 'between' statement witn border excession marks" $ greedyRangeP "<41^42" `shouldSatisfy` isFail - it "correctly parses a single point feature" $ + it "correctly parses a single point feature" $ greedyRangeP "42" `shouldBe` successful (Point 42) - it "does not parse a single point feature with border excession marks" $ + it "does not parse a single point feature with border excession marks" $ greedyRangeP "<3" `shouldSatisfy` isFail - it "correctly parses a join() statement" $ + it "correctly parses a join() statement" $ greedyRangeP "join(2,12..56)" `shouldBe` successful (Join [Point 2, Span (RangeBorder Precise 12) (RangeBorder Precise 56)]) - it "correctly parses a sophisticated join() statement" $ + it "correctly parses a sophisticated join() statement" $ greedyRangeP "join(2^3,<5..10,15,20..>28)" `shouldBe` successful (Join [Between 2 3, Span (RangeBorder Exceeded 5) (RangeBorder Precise 10), Point 15, Span (RangeBorder Precise 20) (RangeBorder Exceeded 28)]) - it "correctly parses a complement() statement" $ + it "correctly parses a complement() statement" $ greedyRangeP "complement(69..>420)" `shouldBe` successful (Complement (Span (RangeBorder Precise 69) (RangeBorder Exceeded 420))) - it "correctly parses a join() incorporated into a complement()" $ + it "correctly parses a join() incorporated into a complement()" $ greedyRangeP "complement(join(2^3,<5..10,15,20..>28))" `shouldBe` successful (Complement (Join [Between 2 3, Span (RangeBorder Exceeded 5) (RangeBorder Precise 10), Point 15, Span (RangeBorder Precise 20) (RangeBorder Exceeded 28)])) where greedyRangeP :: Text -> Either String Range greedyRangeP = over _Left errorBundlePretty . parse (rangeP <* eof) "" - + successful :: a -> Either String a successful = Right isFail :: Either String a -> Bool - isFail = null + isFail = null pAAVGFPSpecP :: FilePath -> Spec pAAVGFPSpecP path = describe "pAAVGFP" $ do @@ -101,7 +102,7 @@ baseCountWithSophisticatedRangesAndMultilineFeatures :: FilePath -> Spec baseCountWithSophisticatedRangesAndMultilineFeatures path = describe "" $ do it "correctly parses the 'BASE COUNT' line and features with sophisticated ranges" $ do gbS <- gbSeq <$> fromFile path - gbS `shouldBe` unsafeMarkedSequence sophisticatedFeaturesSeq sophisticatedFeatures + gbS `shouldBe` unsafeMarkedSequence sophisticatedFeaturesSeq sophisticatedFeatures compPreciseSpan :: (Int, Int) -> Range compPreciseSpan = Complement . preciseSpan @@ -218,20 +219,20 @@ sophisticatedFeaturesSeq :: String sophisticatedFeaturesSeq = "cctacagcgtgagctatgagaaagcgccacgcttcccgaagggagaaaggcggacaggtatccggtaagcggcagggtcggaacaggagagcgcacgagggagcttccagggggaaacgcctggtatctttatagtcctgtcgggtttcgccacctctgacttgagcgtcgatttttgtgatgctcgtcaggggggcggagcctatggaaaaacgccagcaacgcggcctttttacggttcctggccttttgctggccttttgctcacatgttctttcctgcgttatcccctgattctgtggataaccgtattaccgcctttgagtgagctgataccgctcgccgcagccgaacgaccgagcgcagcgagtcagtgagcgaggaagcgtacatttatattggctcatgtccaatatgaccgccatgttgacattgattattgactagaccgcgttacataacttacggtaaatggcccgcctggctgaccgcccaacgacccccgcccattgacgtcaataatgacgtatgttcccatagtaacgccaatagggactttccattgacgtcaatgggtggagtatttacggtaaactgcccacttggcagtacatcaagtgtatcatatgccaagtacgccccctattgacgtcaatgacggtaaatggcccgcctggcattatgcccagtacatgaccttatgggactttcctacttggcagtacatctacgtattagtcatcgctattaccatggtgatgcggttttggcagtacatcaatgggcgtggatagcggtttgactcacggggatttccaagtctccaccccattgacgtcaatgggagtttgttttggcaccaaaatcaacgggactttccaaaatgtcgtaacaactccgccccattgacgcaaatgggcggtaggcgtgtacggtgggaggtctatataagcagagctcgtttagtgaaccgtcagatcgcctggagacgccatccacgctgttttgacctccatagaagacaccgggaccgatccagcctccgcggccgggaacggtgcattggaacgcggattccccgtgccaagagtgacgtaagtaccgcctatagagtctataggcccacccccttggcttcttatgcatgctatactgtttttggcttggggtctatacacccccgcttcctcatgttataggtgatggtatagcttagcctataggtgtgggttattgaccattattgaccactcccctattggtgacgatactttccattactaatccataacatggctctttgccacaactctctttattggctatatgccaatacactgtccttcagagactgacacggactctgtatttttacaggatggggtctcatttattatttacaaattcacatatacaacaccaccgtccccagtgcccgcagtttttattaaacataacgtgggatctccacgcgaatctcgggtacgtgttccggacatgggctcatctccggtagcggcggagcttctacatccgagccctgctcccatgcctccagcgactcatggtcgctcggcagctccttgctcctaacagtggaggccagacttaggcacagcacgatgcccaccaccaccagtgtgccgcacaaggccgtggcggtagggtatgtgtctgaaaatgagctcggggagcgggcttgcaccgctgacgcatttggaagacttaaggcagcggcagaagaagatgcaggcagctgagttgttgtgttctgataagagtcagaggtaactcccgttgcggtgctgttaacggtggagggcagtgtagtctgagcagtactcgttgctgccgcgcgcgccaccagacataatagctgacagactaacagactgttcctttccatgggtcttttctgcagtcaccgtccttgacacgaagcttgccgccaccatggagaccgacaccctgctgctgtgggtgctgctgctgtgggtgcccgggtcgacgaagagctcatgagcggatacatatttgaatgtatttagaaaaataaacaaataggggtcagtgttacaaccaattaaccaattctgaacattatcgcgagcccatttatacctgaatatggctcataacaccccttgtttgcctggcggcagtagcgcggtggtcccacctgaccccatgccgaactcagaagtgaaacgccgtagcgccgatggtagtgtggggactccccatgcgagagtagggaactgccaggcatcaaataaaacgaaaggctcagtcgaaagactgggcctttcgcccgggctaattatggggtgtcgcccttggggtgagaccctcgagtgtacagaattcttactgatacgtgtccagatcaaccgctttcacgacctctaccagacacatgtgatcacggcgctcgtcgcggtctttgctcagtttggtgtggtaggtaatgtgatgataacgcgggatatgcactgccgcggagcccgccaacggacgattcatttggctgcatttggtaaccagtttttcggtcacaccttcaatatcgtacgcctggttgaactcaacgcggatgccattgttaacggtgtcaggcagaatatacagaatgcttggcgggcattggaatgcaacgttcttacgcagaatgtgaccgtctttcttaaagttctcaccagtcagcgtgacacgattgtagatagaaccgcgttcgtaggtaaccatagcacgcgtcttgtacacgccgtcgccttcgaagctgatggtacgctcttgggtataaccttccggcatggcgctcttaaagaaatccttgatgtggctcgggtacttgcgaaacactgaacaccgtagctcagggtgctcaccagggttgcccacgggacccggcaggtcgcccgtagtgcagatgtatttcgctttaatggtacccgtggtcgcgtcacccggtaccctcgcctttaatgataaatttcataccttcgacgtcgccttccagttcggtgatatacgggatctctttctcaaacagttttgcaccttccgtcaatgccgtcatatgtttacctcctaaggtctcgaaaagttaaacaaaattatttctaaagggaaaccgttgtggaattgtgagcgctcacaattccacatattataattgttatccgctcacaaagcaaataaatttttcatgatttcactgtgcatgaagctcgtaattgttatccgctcacaattaagggcgacacaaaatttattctaaatgataataaatactgataacatcttatagtttgtattatattttgtattatcgttgacatgtataattttgatatcaaaaactgattttccctttattattttcgagatttattttcttaattctctttaacaaactagaaatattgtatatacaaaaaatcataaataatagatgaatagtttaattataggtgttcatcaatcgaaaaagcaacgtatcttatttaaagtgcgttgcttttttctcatttataaggttaaataattctcatatatcaagcaaagtgacaggcgcccttaaatattctgacaaatgctctttccctaaactccccccataaaaaaacccgccgaagcgggtttttacgttatttgcggattaacgattactcgttatcagaaccgcccagggggcccgagcttaagactggccgtcgttttacaacacaagctcttccgtacggtggctgcaccatctgtcttcatcttcccgccatctgatgagcagttgaaatctggaactgcctctgttgtgtgcctgctgaataacttctatcccagagaggccaaagtacagtggaaggtggataacgccctccaatcgggtaactcccaggagagtgtcacagagcaggacagcaaggacagcacctacagcctcagcagcaccctgacgctgagcaaagcagactacgagaaacacaaagtctacgcctgcgaagtcacccatcagggcctgagctcgcccgtcacaaagagcttcaacaggggagagtgttaatagtctagacctaggtgatcataatcagccataccacatttgtagaggttttacttgctttaaaaaacctcccacacctccccctgaacctgaaacataaaatgaatgcaattgttgttgttaacttgtttattgcagcttataatggttacaaataaagcaatagcatcacaaatttcacaaataaagcatttttttcactgcattctagttgtggtttgtccaaactcatcaatgtatcttatcatgtctggagatctctagctagaggatcgatccccgccccggacgaactaaacctgactacgacatctctgccccttcttcgcggggcagtgcatgtaatcccttcagttggttggtacaacttgccaactgaaccctaaacgggtagcatatgcttcccgggtagtagtatatactatccagactaaccctaattcaatagcatatgttacccaacgggaagcatatgctatcgaattagggttagtaaaagggtcctaaggaacagcgatgtaggtgggcgggccaagataggggcgcgattgctgcgatctggaggacaaattacacacacttgcgcctgagcgccaagcacagggttgttggtcctcatattcacgaggtcgctgagagcacggtgggctaatgttgccatgggtagcatatactacccaaatatctggatagcatatgctatcctaatctatatctgggtagcataggctatcctaatctatatctgggtagcatatgctatcctaatctatatctgggtagtatatgctatcctaatttatatctgggtagcataggctatcctaatctatatctgggtagcatatgctatcctaatctatatctgggtagtatatgctatcctaatctgtatccgggtagcatatgctatcctaatagagattagggtagtatatgctatcctaatttatatctgggtagcatatactacccaaatatctggatagcatatgctatcctaatctatatctgggtagcatatgctatcctaatctatatctgggtagcataggctatcctaatctatatctgggtagcatatgctatcctaatctatatctgggtagtatatgctatcctaatttatatctgggtagcataggctatcctaatctatatctgggtagcatatgctatcctaatctatatctgggtagtatatgctatcctaatctgtatccgggtagcatatgctatcctcatgataagctgtcaaacatgagaattaattcttgaagacgaaagggcctcgtgatacgcctatttttataggttaatgtcatgataataatggtttcttagacgtcaggtggcacttttcggggaaatgtgcgcggaacccctatttgtttatttttctaaatacattcaaatatgtatccgctcatgagacaataaccctgataaatgcttcaataatattgaaaaaggaagagtatgagtattcaacatttccgtgtcgcccttattcccttttttgcggcattttgccttcctgtttttgctcacccagaaacgctggtgaaagtaaaagatgctgaagatcagttgggtgcacgagtgggttacatcgaactggatctcaacagcggtaagatccttgagagttttcgccccgaagaacgttttccaatgatgagcacttttaaagttctgctatgtggcgcggtattatcccgtgttgacgccgggcaagagcaactcggtcgccgcatacactattctcagaatgacttggttgagtactcaccagtcacagaaaagcatcttacggatggcatgacagtaagagaattatgcagtgctgccataaccatgagtgataacactgcggccaacttacttctgacaacgatcggaggaccgaaggagctaaccgcttttttgcacaacatgggggatcatgtaactcgccttgatcgttgggaaccggagctgaatgaagccataccaaacgacgagcgtgacaccacgatgcctgcagcaatggcaacaacgttgcgcaaactattaactggcgaactacttactctagcttcccggcaacaattaatagactggatggaggcggataaagttgcaggaccacttctgcgctcggcccttccggctggctggtttattgctgataaatctggagccggtgagcgtgggtctcgcggtatcattgcagcactggggccagatggtaagccctcccgtatcgtagttatctacacgacggggagtcaggcaactatggatgaacgaaatagacagatcgctgagataggtgcctcactgattaagcattggtaactgtcagaccaagtttactcatatatactttagattgatttaaaacttcatttttaatttaaaaggatctaggtgaagatcctttttgataatctcatgaccaaaatcccttaacgtgagttttcgttccactgagcgtcagaccccgtagaaaagatcaaaggatcttcttgagatcctttttttctgcgcgtaatctgctgcttgcaaacaaaaaaaccaccgctaccagcggtggtttgtttgccggatcaagagctaccaactctttttccgaaggtaactggcttcagcagagcgcagataccaaatactgttcttctagtgtagccgtagttaggccaccacttcaagaactctgtagcaccgcctacatacctcgctctgctaatcctgttaccagtggctgctgccagtggcgataagtcgtgtcttaccgggttggactcaagacgatagttaccggataaggcgcagcggtcgggctgaacggggggttcgtgcacacagcccagcttggagcgaacgacctacaccgaactgagata" sophisticatedFeatures :: [(Feature, Range)] -sophisticatedFeatures = - [ (Feature "source" +sophisticatedFeatures = + [ (Feature "source" [ ("organism", "synthetic DNA construct") , ("mol_type", "other DNA") ] , preciseSpan (0, 6950)) - , (Feature "rep_origin" + , (Feature "rep_origin" [ ("label", "pUCorigin and also a multiline property") , ("note", "/vntifkey=33") ] , Join [Point 0, preciseSpan (6550, 6950)]) - , (Feature "enhancer" + , (Feature "enhancer" [ ("label", "cmv enhanser") , ("label", "cmv\\enhanser") , ("note", "/vntifkey=9") @@ -251,7 +252,7 @@ sophisticatedFeatures = ] , preciseSpan (1011, 1918)) - , (Feature "primer_bind" + , (Feature "primer_bind" [ ("label", "inv olig1") ] , preciseSpan (1501, 1521)) @@ -269,7 +270,7 @@ sophisticatedFeatures = ] , preciseSpan (1953, 2009)) - , (Feature "misc_feature" + , (Feature "misc_feature" [ ("label", "START") , ("note", "START") , ("note", "/ugene_name=START") @@ -277,7 +278,7 @@ sophisticatedFeatures = ] , preciseSpan (1953, 1955)) - , (Feature "misc_feature" + , (Feature "misc_feature" [ ("label", "GFP stuffer") , ("note", "GFP stuffer") , ("note", "/ugene_name=GFP\\ stuffer") @@ -285,7 +286,7 @@ sophisticatedFeatures = ] , preciseSpan (2010, 3738)) - , (Feature "misc_feature" + , (Feature "misc_feature" [ ("label", "CK") , ("note", "CK") , ("note", "/ugene_name=CK") @@ -293,7 +294,7 @@ sophisticatedFeatures = ] , preciseSpan (3739, 4059)) - , (Feature "misc_feature" + , (Feature "misc_feature" [ ("label", "STOP") , ("note", "STOP") , ("note", "/ugene_name=STOP") @@ -301,7 +302,7 @@ sophisticatedFeatures = ] , preciseSpan (4060, 4065)) - , (Feature "misc_feature" + , (Feature "misc_feature" [ ("gene", "SV40_PA term") , ("label", "SV40_PA term") , ("label", "SV40_PA\\term") @@ -309,18 +310,18 @@ sophisticatedFeatures = ] , preciseSpan (4078, 4316)) - , (Feature "primer_bind" + , (Feature "primer_bind" [ ("label", "pEE_Clab") ] , preciseSpan (4349, 4369)) - , (Feature "rep_origin" + , (Feature "rep_origin" [ ("label", "EBV ori") , ("label", "EBV\\ori") , ("note", "/vntifkey=33") ] , preciseSpan (4581, 4974)) - , (Feature "CDS" + , (Feature "CDS" [ ("codon_start", "1") , ("label", "AmpR") , ("note", "/vntifkey=4") @@ -328,3 +329,10 @@ sophisticatedFeatures = ] , preciseSpan (5542, 6402)) ] + +spacesInLocusName :: FilePath -> Spec +spacesInLocusName path = + describe "should correct parse gb with spaces in plasmid name in locus" $ + it "" $ do + mt <- meta <$> fromFile path + name (locus mt) `shouldBe` "BLABLA BLABLA BLABLA" \ No newline at end of file