// Source code listing (jalview): SeqCigar.java

/*
 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.7)
 * Copyright (C) 2011 J Procter, AM Waterhouse, G Barton, M Clamp, S Searle
 * 
 * This file is part of Jalview.
 * 
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 
 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 * 
 * Jalview is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty 
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 
 * PURPOSE.  See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 */
package jalview.datamodel;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

import jalview.analysis.*;
import jalview.util.*;

public class SeqCigar extends CigarSimple
{
  /**
   * start(inclusive) and end(exclusive) of subsequence on refseq
   */
00032   private int start, end;

  private SequenceI refseq = null;

  private Hashtable seqProps;

  /**
   * Reference dataset sequence for the cigar string
   * 
   * @return SequenceI
   */
00043   public SequenceI getRefSeq()
  {
    return refseq;
  }

  /**
   * 
   * @return int start index of cigar ops on refSeq
   */
00052   public int getStart()
  {
    return start;
  }

  /**
   * 
   * @return int end index (exclusive) of cigar ops on refSeq
   */
00061   public int getEnd()
  {
    return end;
  }

  /**
   * Returns sequence as a string with cigar operations applied to it
   * 
   * @return String
   */
00071   public String getSequenceString(char GapChar)
  {
    return (length == 0) ? "" : (String) getSequenceAndDeletions(
            refseq.getSequenceAsString(start, end), GapChar)[0];
  }

  /**
   * recreates a gapped and edited version of RefSeq or null for an empty cigar
   * string
   * 
   * @return SequenceI
   */
00083   public SequenceI getSeq(char GapChar)
  {
    Sequence seq;
    if (refseq == null || length == 0)
    {
      return null;
    }
    Object[] edit_result = getSequenceAndDeletions(
            refseq.getSequenceAsString(start, end), GapChar);
    if (edit_result == null)
    {
      throw new Error(
              "Implementation Error - unexpected null from getSequenceAndDeletions");
    }
    int bounds[] = (int[]) edit_result[1];
    seq = new Sequence(refseq.getName(), (String) edit_result[0],
            refseq.getStart() + start + bounds[0], refseq.getStart()
                    + start + ((bounds[2] == 0) ? -1 : bounds[2]));
    seq.setDescription(refseq.getDescription());
    int sstart = seq.getStart(), send = seq.getEnd();
    // seq.checkValidRange(); probably not needed
    // recover local properties if present
    if (seqProps != null)
    {
      // this recovers dataset sequence reference as well as local features,
      // names, start/end settings.
      SeqsetUtils.SeqCharacterUnhash(seq, seqProps);
    }
    // ensure dataset sequence is up to date from local reference
    seq.setDatasetSequence(refseq);
    seq.setStart(sstart);
    seq.setEnd(send);
    return seq;
  }

  /*
   * We don't allow this - refseq is given at construction time only:
   * 
   * public void setSeq(SequenceI seq) { this.seq = seq; }
   */
  /**
   * internal constructor - sets seq to a gapless sequence derived from seq and
   * prepends any 'D' operations needed to get to the first residue of seq.
   * 
   * @param seq
   *          SequenceI
   * @param initialDeletion
   *          true to mark initial dataset sequence residues as deleted in
   *          subsequence
   * @param _s
   *          index of first position in seq
   * @param _e
   *          index after last position in (possibly gapped) seq
   * @return true if gaps are present in seq
   */
00137   private boolean _setSeq(SequenceI seq, boolean initialDeletion, int _s,
          int _e)
  {
    boolean hasgaps = false;
    if (seq == null)
    {
      throw new Error("Implementation Error - _setSeq(null,...)");
    }
    if (_s < 0)
    {
      throw new Error("Implementation Error: _s=" + _s);
    }
    String seq_string = seq.getSequenceAsString();
    if (_e == 0 || _e < _s || _e > seq_string.length())
    {
      _e = seq_string.length();
    }
    // resolve start and end positions relative to ungapped reference sequence
    start = seq.findPosition(_s) - seq.getStart();
    end = seq.findPosition(_e) - seq.getStart();
    int l_ungapped = end - start;
    // Find correct sequence to reference and correct start and end - if
    // necessary
    SequenceI ds = seq.getDatasetSequence();
    if (ds == null)
    {
      // make a new dataset sequence
      String ungapped = AlignSeq.extractGaps(
              jalview.util.Comparison.GapChars, new String(seq_string));
      l_ungapped = ungapped.length();
      // check that we haven't just duplicated an ungapped sequence.
      if (l_ungapped == seq.getLength())
      {
        ds = seq;
      }
      else
      {
        ds = new Sequence(seq.getName(), ungapped, seq.getStart(),
                seq.getStart() + ungapped.length() - 1);
        // JBPNote: this would be consistent but may not be useful
        // seq.setDatasetSequence(ds);
      }
    }
    // add in offset between seq and the dataset sequence
    if (ds.getStart() < seq.getStart())
    {
      int offset = seq.getStart() - ds.getStart();
      if (initialDeletion)
      {
        // absolute cigar string
        addDeleted(_s + offset);
        start = 0;
        end += offset;
      }
      else
      {
        // normal behaviour - just mark start and end subsequence
        start += offset;
        end += offset;

      }

    }

    // any gaps to process ?
    if (l_ungapped != (_e - _s))
    {
      hasgaps = true;
    }

    refseq = ds;
    // copy over local properties for the sequence instance of the refseq
    seqProps = SeqsetUtils.SeqCharacterHash(seq);
    // Check offsets
    if (end > ds.getLength())
    {
      throw new Error(
              "SeqCigar: Possible implementation error: sequence is longer than dataset sequence");
      // end = ds.getLength();
    }

    return hasgaps;
  }

  /**
   * directly initialise a cigar object with a sequence of range, operation
   * pairs and a sequence to apply it to. operation and range should be relative
   * to the seq.getStart()'th residue of the dataset seq resolved from seq.
   * 
   * @param seq
   *          SequenceI
   * @param operation
   *          char[]
   * @param range
   *          int[]
   */
00233   public SeqCigar(SequenceI seq, char operation[], int range[])
  {
    super();
    if (seq == null)
    {
      throw new Error("Implementation Bug. Null seq !");
    }
    if (operation.length != range.length)
    {
      throw new Error(
              "Implementation Bug. Cigar Operation list!= range list");
    }

    if (operation != null)
    {
      this.operation = new char[operation.length + _inc_length];
      this.range = new int[operation.length + _inc_length];

      if (_setSeq(seq, false, 0, 0))
      {
        throw new Error(
                "NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
      }
      for (int i = this.length, j = 0; j < operation.length; i++, j++)
      {
        char op = operation[j];
        if (op != M && op != I && op != D)
        {
          throw new Error("Implementation Bug. Cigar Operation '" + j
                  + "' '" + op + "' not one of '" + M + "', '" + I
                  + "', or '" + D + "'.");
        }
        this.operation[i] = op;
        this.range[i] = range[j];
      }
      this.length += operation.length;
    }
    else
    {
      this.operation = null;
      this.range = null;
      this.length = 0;
      if (_setSeq(seq, false, 0, 0))
      {
        throw new Error(
                "NOT YET Implemented: Constructing a Cigar object from a cigar string and a gapped sequence.");
      }
    }
  }

  /**
   * add range matched residues to cigar string
   * 
   * @param range
   *          int
   */
00289   public void addMatch(int range)
  {
    this.addOperation(M, range);
  }

  /**
   * Adds insertion and match operations based on seq to the cigar up to the
   * endpos column of seq.
   * 
   * @param cigar
   *          CigarBase
   * @param seq
   *          SequenceI
   * @param startpos
   *          int
   * @param endpos
   *          int
   * @param initialDeletions
   *          if true then initial deletions will be added from start of seq to
   *          startpos
   */
00310   protected static void addSequenceOps(CigarBase cigar, SequenceI seq,
          int startpos, int endpos, boolean initialDeletions)
  {
    char op = '\0';
    int range = 0;
    int p = 0, res = seq.getLength();

    if (!initialDeletions)
    {
      p = startpos;
    }

    while (p <= endpos)
    {
      boolean isGap = (p < res) ? jalview.util.Comparison.isGap(seq
              .getCharAt(p)) : true;
      if ((startpos <= p) && (p <= endpos))
      {
        if (isGap)
        {
          if (range > 0 && op != I)
          {
            cigar.addOperation(op, range);
            range = 0;
          }
          op = I;
          range++;
        }
        else
        {
          if (range > 0 && op != M)
          {
            cigar.addOperation(op, range);
            range = 0;
          }
          op = M;
          range++;
        }
      }
      else
      {
        if (!isGap)
        {
          if (range > 0 && op != D)
          {
            cigar.addOperation(op, range);
            range = 0;
          }
          op = D;
          range++;
        }
        else
        {
          // do nothing - insertions are not made in flanking regions
        }
      }
      p++;
    }
    if (range > 0)
    {
      cigar.addOperation(op, range);
    }
  }

  /**
   * create a cigar string for given sequence
   * 
   * @param seq
   *          SequenceI
   */
00380   public SeqCigar(SequenceI seq)
  {
    super();
    if (seq == null)
    {
      throw new Error("Implementation error for new Cigar(SequenceI)");
    }
    _setSeq(seq, false, 0, 0);
    // there is still work to do
    addSequenceOps(this, seq, 0, seq.getLength() - 1, false);
  }

  /**
   * Create Cigar from a range of gaps and residues on a sequence object
   * 
   * @param seq
   *          SequenceI
   * @param start
   *          int - first column in range
   * @param end
   *          int - last column in range
   */
00402   public SeqCigar(SequenceI seq, int start, int end)
  {
    super();
    if (seq == null)
    {
      throw new Error("Implementation error for new Cigar(SequenceI)");
    }
    _setSeq(seq, false, start, end + 1);
    // there is still work to do
    addSequenceOps(this, seq, start, end, false);
  }

  /**
   * Create a cigar object from a cigar string like '[<I|D|M><range>]+' Will
   * fail if the given seq already contains gaps (JBPNote: future implementation
   * will fix)
   * 
   * @param seq
   *          SequenceI object resolvable to a dataset sequence
   * @param cigarString
   *          String
   * @return Cigar
   */
00425   public static SeqCigar parseCigar(SequenceI seq, String cigarString)
          throws Exception
  {
    Object[] opsandrange = parseCigarString(cigarString);
    return new SeqCigar(seq, (char[]) opsandrange[0],
            (int[]) opsandrange[1]);
  }

  /**
   * create an alignment from the given array of cigar sequences and gap
   * character, and marking the given segments as visible in the given
   * columselection.
   * 
   * @param alseqs
   * @param gapCharacter
   * @param colsel
   *          - columnSelection where hidden regions are marked
   * @param segments
   *          - visible regions of alignment
   * @return SequenceI[]
   */
00446   public static SequenceI[] createAlignmentSequences(SeqCigar[] alseqs,
          char gapCharacter, ColumnSelection colsel, int[] segments)
  {
    SequenceI[] seqs = new SequenceI[alseqs.length];
    StringBuffer[] g_seqs = new StringBuffer[alseqs.length];
    String[] alseqs_string = new String[alseqs.length];
    Object[] gs_regions = new Object[alseqs.length];
    for (int i = 0; i < alseqs.length; i++)
    {
      alseqs_string[i] = alseqs[i].getRefSeq().getSequenceAsString(
              alseqs[i].start, alseqs[i].end);
      gs_regions[i] = alseqs[i].getSequenceAndDeletions(alseqs_string[i],
              gapCharacter); // gapped sequence, {start, start col, end.
      // endcol}, hidden regions {{start, end, col}})
      if (gs_regions[i] == null)
      {
        throw new Error("Implementation error: " + i
                + "'th sequence Cigar has no operations.");
      }
      g_seqs[i] = new StringBuffer((String) ((Object[]) gs_regions[i])[0]); // the
      // visible
      // gapped
      // sequence
    }
    // Now account for insertions. (well - deletions)
    // this is complicated because we must keep track of shifted positions in
    // each sequence
    ShiftList shifts = new ShiftList();
    for (int i = 0; i < alseqs.length; i++)
    {
      Object[] gs_region = ((Object[]) ((Object[]) gs_regions[i])[2]);
      if (gs_region != null)

      {
        for (int hr = 0; hr < gs_region.length; hr++)
        {
          int[] region = (int[]) gs_region[hr];
          char[] insert = new char[region[1] - region[0] + 1];
          for (int s = 0; s < insert.length; s++)
          {
            insert[s] = gapCharacter;
          }
          int inspos = shifts.shift(region[2]); // resolve insertion position in
          // current alignment frame of
          // reference
          for (int s = 0; s < alseqs.length; s++)
          {
            if (s != i)
            {
              if (g_seqs[s].length() <= inspos)
              {
                // prefix insertion with more gaps.
                for (int l = inspos - g_seqs[s].length(); l > 0; l--)
                {
                  g_seqs[s].append(gapCharacter); // to debug - use a diffferent
                  // gap character here
                }
              }
              g_seqs[s].insert(inspos, insert);
            }
            else
            {
              g_seqs[s].insert(inspos,
                      alseqs_string[i].substring(region[0], region[1] + 1));
            }
          }
          shifts.addShift(region[2], insert.length); // update shift in
          // alignment frame of
          // reference
          if (segments == null)
          {
            // add a hidden column for this deletion
            colsel.hideColumns(inspos, inspos + insert.length - 1);
          }
        }
      }
    }
    for (int i = 0; i < alseqs.length; i++)
    {
      int[] bounds = ((int[]) ((Object[]) gs_regions[i])[1]);
      SequenceI ref = alseqs[i].getRefSeq();
      seqs[i] = new Sequence(ref.getName(), g_seqs[i].toString(),
              ref.getStart() + alseqs[i].start + bounds[0], ref.getStart()
                      + alseqs[i].start + (bounds[2] == 0 ? -1 : bounds[2]));
      seqs[i].setDatasetSequence(ref);
      seqs[i].setDescription(ref.getDescription());
    }
    if (segments != null)
    {
      for (int i = 0; i < segments.length; i += 3)
      {
        // int start=shifts.shift(segments[i]-1)+1;
        // int end=shifts.shift(segments[i]+segments[i+1]-1)-1;
        colsel.hideColumns(segments[i + 1], segments[i + 1]
                + segments[i + 2] - 1);
      }
    }
    return seqs;
  }

  /**
   * non rigorous testing
   */
  /**
   * 
   * @param seq
   *          Sequence
   * @param ex_cs_gapped
   *          String
   * @return String
   */
00557   public static String testCigar_string(Sequence seq, String ex_cs_gapped)
  {
    SeqCigar c_sgapped = new SeqCigar(seq);
    String cs_gapped = c_sgapped.getCigarstring();
    if (!cs_gapped.equals(ex_cs_gapped))
    {
      System.err.println("Failed getCigarstring: incorect string '"
              + cs_gapped + "' != " + ex_cs_gapped);
    }
    return cs_gapped;
  }

  /**
   * Checks that the sequence rebuilt from a cigar (with '-' as gap character)
   * has the same residues and gaps as the given sequence. Failures are
   * reported on System.err.
   * 
   * @param gen_sgapped
   *          SeqCigar to rebuild a sequence from
   * @param s_gapped
   *          SequenceI holding the expected gapped sequence
   * @return true if the rebuilt sequence string equals that of s_gapped
   */
  public static boolean testSeqRecovery(SeqCigar gen_sgapped,
          SequenceI s_gapped)
  {
    // this is non-rigorous - start and end recovery is not tested.
    SequenceI gen_sgapped_s = gen_sgapped.getSeq('-');
    String expected = s_gapped.getSequenceAsString();
    if (gen_sgapped_s == null)
    {
      // getSeq returns null for a cigar with no operations
      System.err.println("Couldn't reconstruct sequence.\n" + "null" + "\n"
              + expected);
      return false;
    }
    // Compare the String forms: equals() on the result of getSequence() is
    // only an identity test if that accessor returns an array.
    String recovered = gen_sgapped_s.getSequenceAsString();
    if (!recovered.equals(expected))
    {
      System.err.println("Couldn't reconstruct sequence.\n" + recovered
              + "\n" + expected);
      return false;
    }
    return true;
  }

  /**
   * Non-rigorous self test: builds cigars from gapped and ungapped versions of
   * a test sequence, reports mismatches on System.err and prints
   * reconstructed sequences and alignments on System.out.
   * 
   * @param argv
   *          ignored
   * @throws Exception
   *           declared for parseCigar
   */
  public static void main(String argv[]) throws Exception
  {
    Sequence s = new Sequence("MySeq",
            "asdfktryasdtqwrtsaslldddptyipqqwaslchvhttt", 39, 80);
    Sequence s_gapped = new Sequence(
            "MySeq",
            "----asdf------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhttt",
            39, 80);
    String ex_cs_gapped = "4I4M6I6M3I11M4I12M4I9M";
    s_gapped.setDatasetSequence(s);
    String sub_gapped_s;
    Sequence s_subsequence_gapped = new Sequence(
            "MySeq",
            sub_gapped_s = "------ktryas---dtqwrtsasll----dddptyipqqwa----slchvh",
            43, 77);

    s_subsequence_gapped.setDatasetSequence(s);
    SeqCigar c_null = new SeqCigar(s);
    String cs_null = c_null.getCigarstring();
    if (!cs_null.equals("42M"))
    {
      // test for emptiness by content: == on Strings compares references
      System.err
              .println("Failed to recover ungapped sequence cigar operations:"
                      + ((cs_null.length() == 0) ? "empty string" : cs_null));
    }
    testCigar_string(s_gapped, ex_cs_gapped);
    SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
    if (!gen_sgapped.getCigarstring().equals(ex_cs_gapped))
    {
      System.err.println("Failed parseCigar(" + ex_cs_gapped
              + ")->getCigarString()->'" + gen_sgapped.getCigarstring()
              + "'");
    }
    testSeqRecovery(gen_sgapped, s_gapped);
    // Test dataset resolution
    SeqCigar sub_gapped = new SeqCigar(s_subsequence_gapped);
    if (!testSeqRecovery(sub_gapped, s_subsequence_gapped))
    {
      System.err
              .println("Failed recovery for subsequence of dataset sequence");
    }
    // width functions
    if (sub_gapped.getWidth() != sub_gapped_s.length())
    {
      System.err.println("Failed getWidth()");
    }

    // called for exercise only - the result is not checked
    sub_gapped.getFullWidth();
    if (sub_gapped.hasDeletedRegions())
    {
      System.err.println("hasDeletedRegions is incorrect.");
    }
    // Test start-end region SeqCigar
    SeqCigar sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
    if (sub_se_gp.getWidth() != 41)
    {
      System.err
              .println("SeqCigar(seq, start, end) not properly clipped alignsequence.");
    }
    System.out.println("Original sequence align:\n" + sub_gapped_s
            + "\nReconstructed window from 8 to 48\n" + "XXXXXXXX"
            + sub_se_gp.getSequenceString('-') + "..." + "\nCigar String:"
            + sub_se_gp.getCigarstring() + "\n");
    SequenceI ssgp = sub_se_gp.getSeq('-');
    System.out.println("\t " + ssgp.getSequenceAsString());
    // repeatedly delete sliding ranges from a fresh cigar and print the result
    for (int r = 0; r < 10; r++)
    {
      sub_se_gp = new SeqCigar(s_subsequence_gapped, 8, 48);
      int sl = sub_se_gp.getWidth();
      int st = sl - 1 - r;
      for (int rs = 0; rs < 10; rs++)
      {
        int e = st + rs;
        sub_se_gp.deleteRange(st, e);
        String ssgapedseq = sub_se_gp.getSeq('-').getSequenceAsString();
        System.out.println(st + "," + e + "\t:" + ssgapedseq);
        st -= 3;
      }
    }
    {
      SeqCigar[] set = new SeqCigar[]
      { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
          new SeqCigar(s_gapped) };
      Alignment al = new Alignment(set);
      for (int i = 0; i < al.getHeight(); i++)
      {
        System.out.println("" + al.getSequenceAt(i).getName() + "\t"
                + al.getSequenceAt(i).getStart() + "\t"
                + al.getSequenceAt(i).getEnd() + "\t"
                + al.getSequenceAt(i).getSequenceAsString());
      }
    }
    {
      System.out.println("Gapped.");
      SeqCigar[] set = new SeqCigar[]
      { new SeqCigar(s), new SeqCigar(s_subsequence_gapped, 8, 48),
          new SeqCigar(s_gapped) };
      set[0].deleteRange(20, 25);
      Alignment al = new Alignment(set);
      for (int i = 0; i < al.getHeight(); i++)
      {
        System.out.println("" + al.getSequenceAt(i).getName() + "\t"
                + al.getSequenceAt(i).getStart() + "\t"
                + al.getSequenceAt(i).getEnd() + "\t"
                + al.getSequenceAt(i).getSequenceAsString());
      }
    }
    // if (!ssgapedseq.equals("ryas---dtqqwa----slchvh"))
    // System.err.println("Subseqgaped\n------ktryas---dtqwrtsasll----dddptyipqqwa----slchvhryas---dtqwrtsasll--qwa----slchvh\n"+ssgapedseq+"\n"+sub_se_gp.getCigarstring());
  }

  /**
   * references to entities that this sequence cigar is associated with.
   */
00700   private Hashtable selGroups = null;

  /**
   * Associate this sequence cigar with the given entity. The membership table
   * is created on first use.
   * 
   * @param group
   *          entity to associate with; stored as a key of the table
   */
  public void setGroupMembership(Object group)
  {
    Hashtable memberships = selGroups;
    if (memberships == null)
    {
      memberships = new Hashtable();
      selGroups = memberships;
    }
    // only the key matters; the value is a placeholder
    memberships.put(group, new int[0]);
  }

  /**
   * Test for and if present remove association to group.
   * 
   * @param group
   * @return true if group was associated and it was removed
   */
00717   public boolean removeGroupMembership(Object group)
  {
    if (selGroups != null && selGroups.containsKey(group))
    {
      selGroups.remove(group);
      return true;
    }
    return false;
  }

  /**
   * forget all associations for this sequence.
   */
00730   public void clearMemberships()
  {
    if (selGroups != null)
    {
      selGroups.clear();
    }
    selGroups = null;
  }

  /**
   * 
   * @return null or array of all associated entities
   */
00743   public Object[] getAllMemberships()
  {
    if (selGroups == null)
    {
      return null;
    }
    Object[] mmbs = new Object[selGroups.size()];
    Enumeration en = selGroups.keys();
    for (int i = 0; en.hasMoreElements(); i++)
    {
      mmbs[i] = en.nextElement();
    }
    return mmbs;
  }

  /**
   * Test for group membership
   * 
   * @param sgr
   *          - a selection group or some other object that may be associated
   *          with seqCigar
   * @return true if sgr is associated with this seqCigar
   */
00766   public boolean isMemberOf(Object sgr)
  {
    return (selGroups != null) && selGroups.get(sgr) != null;
  }
}

// Generated by Doxygen 1.6.0